From 86fb87ec8a0c934e6dc8c38bc936e262bdc0009e Mon Sep 17 00:00:00 2001 From: jscaber Date: Thu, 10 Feb 2022 21:52:33 +0000 Subject: [PATCH 01/43] remove scriptsdir --- cgatpipelines/tools/pipeline_exome.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 30e3adf3a..71f09acfe 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -212,7 +212,6 @@ def getGATKOptions(): @files(PARAMS["roi_bed"], "roi.load") def loadROI(infile, outfile): '''Import regions of interest bed file into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] header = "chr,start,stop,feature" tablename = P.to_table(outfile) statement = '''cat %(infile)s @@ -231,7 +230,6 @@ def loadROI(infile, outfile): @files(PARAMS["roi_to_gene"], "roi2gene.load") def loadROI2Gene(infile, outfile): '''Import genes mapping to regions of interest bed file into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] tablename = P.to_table(outfile) statement = '''cat %(infile)s | cgat csv2db %(csv2db_options)s @@ -248,7 +246,6 @@ def loadROI2Gene(infile, outfile): @files(PARAMS["samples"], "samples.load") def loadSamples(infile, outfile): '''Import sample information into SQLite.''' - scriptsdir = PARAMS["general_scriptsdir"] tablename = P.to_table(outfile) statement = '''cat %(infile)s | cgat csv2db %(csv2db_options)s @@ -2006,7 +2003,6 @@ def buildVCFstats(infile, outfile): @merge(buildVCFstats, "vcf_stats.load") def loadVCFstats(infiles, outfile): '''Import variant statistics into SQLite''' - scriptsdir = PARAMS["general_scriptsdir"] filenames = " ".join(infiles) tablename = P.to_table(outfile) E.info("Loading vcf stats...") From b90a82fa2e2214119bbf9370b4042e95a6611492 Mon Sep 17 00:00:00 2001 From: jscaber Date: Thu, 10 Feb 2022 21:57:24 +0000 Subject: [PATCH 02/43] remove scriptsdir --- .../tools/pipeline_exome/pipeline.yml | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 1d8158e1c..d55ce7e27 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -4,16 +4,11 @@ ## Exome pipeline parameters ########################################################## - - -# Script directory -scriptsdir: /ifs/devel/katherineb/cgat/scripts - # the genome to use -genome: hg19 +genome: hg38 # location of indexed genome for SAMtools -genome_dir: /ifs/mirror/genomes/gatk +genome_dir: /path/to/genome/dir ######################################################## # directory where exported data is located @@ -27,18 +22,18 @@ web_dir: ../web # If you are not running exome data but targetted sequencing on a smaller region then change to 1 targetted: 0 -#pedfile=/ifs/projects/proj024/analysis/illumina_fam8-16/fastq_decon_o5_filt_qc/mapping_human_g1k/samples.ped +#pedfile=/path/to/pedfile # seed for random number generation randomseed: 1000 # gtf file containing exon annotations -geneset: /ifs/mirror/annotations/hg19_ensembl75_hierarchical/ensembl.dir/geneset_all.gtf.gz +geneset: /path/to/ensembl.dir/geneset_all.gtf.gz bwa: # location od BWA indexed genome - index_dir: /ifs/mirror/genomes/gatk + index_dir: /path/to/genome # threads threads: 8 @@ -70,15 +65,15 @@ gatk: threads: 8 - dbsnp: /ifs/mirror/genomes/gatk/dbsnp_138.hg19.vcf + dbsnp: /path/to/dbsnp_138.hg19.vcf - hapmap: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf + 
hapmap: /path/to/hapmap_3.3.hg19.sites.vcf - omni: /ifs/mirror/genomes/gatk/1000G_omni2.5.hg19.sites.vcf + omni: /path/to/omni/1000G_omni2.5.hg19.sites.vcf - kgenomes: /ifs/mirror/genomes/gatk/1000G_phase1.snps.high_confidence.hg19.sites.vcf + kgenomes: /path/to/1000G_phase1.snps.high_confidence.hg19.sites.vcf - mills: /ifs/mirror/genomes/gatk/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf + mills: /path/to/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf solid_options: '' @@ -92,7 +87,7 @@ gatk: hapmap: - loc: /ifs/mirror/hapmap/genotype_frequencies + loc: /path/to/hapmap/genotype_frequencies # bgzip compressede and tabix indexed VCF file for Hapmap vcf: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf @@ -227,7 +222,7 @@ coverage: candidates: '' # path to r script for generating the coverage plots - rscript: /ifs/projects/proj024/analysis/GRIN1_coverage/coverage.r + rscript: /path/to/coverage.r # the minimum read depth for counting a base as covered threshold: 15 From 482a29460e6bf1c1feef4f1423cb0b350175ec00 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:07:09 +0000 Subject: [PATCH 03/43] substatial changes to exome pipeline - bringing it in line with pipeline bamstats and changes in gatk --- cgatpipelines/tasks/exome.py | 53 ++++----- cgatpipelines/tools/pipeline_exome.py | 111 ++++++++++-------- .../tools/pipeline_exome/pipeline.yml | 5 + 3 files changed, 89 insertions(+), 80 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 6a0cb662e..3b7b61059 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -27,11 +27,6 @@ # Set PARAMS in calling module PARAMS = {} - -def getGATKOptions(): - return "-l mem_free=1.4G" - - def makeSoup(address): sock = urlopen(address) htmlSource = sock.read() @@ -41,23 +36,23 @@ def makeSoup(address): ############################################################################## -def GATKReadGroups(infile, outfile, genome, +def GATKReadGroups(infile, outfile, dictionary, library="unknown", platform="Illumina", - platform_unit="1", track="unknown"): + platform_unit="1", track="unknown", gatkmem="2G"): '''Reorders BAM according to reference fasta and adds read groups''' if track == 'unknown': track = P.snip(os.path.basename(infile), ".bam") tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''picard ReorderSam - INPUT=%(infile)s - OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam - REFERENCE=%(genome)s - ALLOW_INCOMPLETE_DICT_CONCORDANCE=true - VALIDATION_STRINGENCY=SILENT ;''' % locals() + -INPUT=%(infile)s + -OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam + -SEQUENCE_DICTIONARY=%(dictionary)s + -ALLOW_INCOMPLETE_DICT_CONCORDANCE=true + -VALIDATION_STRINGENCY=SILENT ;''' % locals() statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; ''' % locals() @@ -80,11 +75,11 @@ def GATKReadGroups(infile, outfile, genome, ############################################################################## -def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4): +def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4, gatkmem="2G"): '''Realigns BAMs around indels using GATK''' intervalfile = outfile.replace(".bam", ".intervals") - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK @@ -108,12 +103,12 @@ def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4): def GATKBaseRecal(infile, 
outfile, genome, intervals, padding, dbsnp, - solid_options=""): + solid_options="", gatkmem="2G"): '''Recalibrates base quality scores using GATK''' track = P.snip(os.path.basename(infile), ".bam") tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK @@ -140,10 +135,10 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, def haplotypeCaller(infile, outfile, genome, - dbsnp, intervals, padding, options): + dbsnp, intervals, padding, options, gatkmem = "2G"): '''Call SNVs and indels using GATK HaplotypeCaller in all members of a family together''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK @@ -164,9 +159,9 @@ def haplotypeCaller(infile, outfile, genome, ############################################################################## -def genotypeGVCFs(inputfiles, outfile, genome, options): +def genotypeGVCFs(inputfiles, outfile, genome, options, gatkmem="2G"): '''Joint genotyping of all samples together''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK @@ -246,9 +241,9 @@ def strelkaINDELCaller(infile_control, infile_tumor, outfile, genome, config, def variantAnnotator(vcffile, bamlist, outfile, genome, - dbsnp, annotations="", snpeff_file=""): + dbsnp, annotations="", snpeff_file="", gatkmem="2G"): '''Annotate variant file using GATK VariantAnnotator''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 if "--useAllAnnotations" in annotations: @@ -274,9 +269,9 @@ def variantAnnotator(vcffile, bamlist, outfile, genome, def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, - kgenomes=None, hapmap=None, omni=None, mills=None): + kgenomes=None, hapmap=None, omni=None, mills=None, gatkmem = "2G"): '''Create variant recalibration file''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 track = P.snip(outfile, ".recal") @@ -314,9 +309,9 @@ def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, ############################################################################## -def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode): +def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode, gatkmem): '''Perform variant quality score recalibration using GATK ''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK -T ApplyRecalibration @@ -332,9 +327,9 @@ def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode): ############################################################################## -def vcfToTable(infile, outfile, genome, columns): +def vcfToTable(infile, outfile, genome, columns, gatkmem): '''Converts vcf to tab-delimited file''' - job_options = getGATKOptions() + job_memory = gatkmem job_threads = 3 statement = '''GenomeAnalysisTK -T VariantsToTable diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 71f09acfe..1e170c00e 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -177,7 +177,7 @@ import cgatcore.iotools as iotools from cgatcore import pipeline as P import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.exome as exome import cgatpipelines.tasks.exomeancestry as exomeancestry @@ -196,9 +196,10 
@@ matches = glob.glob("*.fastq.1.gz") + glob.glob( "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz") +PICARD_MEMORY = PARAMS["picard_memory"] + +GATK_MEMORY = PARAMS["gatk_memory"] -def getGATKOptions(): - return "-l mem_free=1.4G" ############################################################################### ############################################################################### @@ -255,6 +256,17 @@ def loadSamples(infile, outfile): > %(outfile)s ''' P.run(statement) +@originate("genome.dict") +def createSequenceDictionary(outfile): + '''creates a sequence dictionary required for running + gatk/picard tools''' + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + + statement = '''picard CreateSequenceDictionary + -R %(genome_file)s -O %(outfile)s''' + P.run(statement) + ############################################################################### ############################################################################### ############################################################################### @@ -280,8 +292,8 @@ def mapReads(infiles, outfile): @transform(mapReads, regex(r"bam/(\S+).bam"), r"bam/\1.picard_stats") def PicardAlignStats(infile, outfile): '''Run Picard CollectMultipleMetrics on each BAM file''' - genome = PARAMS["bwa_index_dir"] + "/" + PARAMS["genome"] + ".fa" - mappingqc.buildPicardAlignmentStats(infile, outfile, genome) + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY) ############################################################################### @@ -290,7 +302,7 @@ def PicardAlignStats(infile, outfile): @merge(PicardAlignStats, "picard_stats.load") def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) ############################################################################### ############################################################################### @@ -299,27 +311,26 @@ def loadPicardAlignStats(infiles, outfile): @follows(loadPicardAlignStats, mkdir("gatk")) -@transform(mapReads, regex(r"bam/(\S+).bam"), r"gatk/\1.readgroups.bam") -def GATKReadGroups(infile, outfile): + + + +@transform(mapReads, regex(r"bam/(\S+).bam"), + add_inputs(r"gatk/\1.merged.bam.count"), + r"gatk/\1.readgroups.bam") +def GATKReadGroups(infile, dictionary, outfile): '''Reorders BAM according to reference fasta and adds read groups using GATK''' - '''Reorders BAM according to reference fasta and add read groups using - SAMtools, realigns around indels and recalibrates base quality - scores using GATK - - ''' track = re.sub(r'-\w+-\w+\.bam', '', os.path.basename(infile)) tmpdir_gatk = P.get_temp_dir('.') - job_options = getGATKOptions() job_threads = PARAMS["gatk_threads"] library = PARAMS["readgroup_library"] platform = PARAMS["readgroup_platform"] platform_unit = PARAMS["readgroup_platform_unit"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - exome.GATKReadGroups(infile, outfile, genome, + exome.GATKReadGroups(infile, outfile, dictionary, library, platform, - platform_unit, track) + platform_unit, track, + GATK_MEMORY) iotools.zap_file(infile) ############################################################################### @@ -333,7 +344,7 @@ def GATKReadGroups(infile, outfile): r"gatk/\1.dedup.bam") def 
RemoveDuplicatesLane(infile, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.buildPicardDuplicateStats(infile, outfile) + bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY) iotools.zap_file(infile) ############################################################################### @@ -343,7 +354,7 @@ def RemoveDuplicatesLane(infile, outfile): @merge(RemoveDuplicatesLane, "picard_duplicate_stats_lane.load") def loadPicardDuplicateStatsLane(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) + bamstats.loadPicardDuplicateStats(infiles, outfile) ############################################################################### @@ -358,7 +369,7 @@ def GATKIndelRealignLane(infile, outfile): intervals = PARAMS["roi_intervals"] padding = PARAMS["roi_padding"] exome.GATKIndelRealign(infile, outfile, genome, intervals, padding, - threads) + threads, GATK_MEMORY) iotools.zap_file(infile) ############################################################################### @@ -381,7 +392,8 @@ def GATKBaseRecal(infile, outfile): shutil.copyfile(intrack + ".bai", outtrack + ".bai") else: exome.GATKBaseRecal(infile, outfile, genome, intervals, - padding, dbsnp, solid_options) + padding, dbsnp, solid_options, + GATK_MEMORY) iotools.zap_file(infile) ############################################################################### @@ -425,7 +437,7 @@ def RemoveDuplicatesSample(infiles, outfile): infile, countfile = infiles countf = open(countfile, "r") if countf.read() != '1': - mappingqc.buildPicardDuplicateStats(infile, outfile) + bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY) else: shutil.copyfile(infile, outfile) shutil.copyfile(infile + ".bai", outfile + ".bai") @@ -439,7 +451,7 @@ def RemoveDuplicatesSample(infiles, outfile): @merge(RemoveDuplicatesSample, "picard_duplicate_stats_sample.load") def loadPicardDuplicateStatsSample(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) + bamstats.loadPicardDuplicateStats(infiles, outfile) ############################################################################### ############################################################################### @@ -461,7 +473,7 @@ def GATKIndelRealignSample(infiles, outfile): countf = open(countfile, "r") if countf.read() > '1': exome.GATKIndelRealign(infiles[0], outfile, genome, intervals, - padding, threads) + padding, threads, GATK_MEMORY) else: shutil.copyfile(infile, outfile) shutil.copyfile(infile + ".bai", outfile + ".bai") @@ -481,8 +493,8 @@ def buildCoverageStats(infile, outfile): file using Picard''' baits = PARAMS["roi_baits"] regions = PARAMS["roi_regions"] - mappingqc.buildPicardCoverageStats(infile, outfile, - baits, regions) + bamstats.buildPicardCoverageStats(infile, outfile, + baits, regions, PICARD_MEMORY) ############################################################################### @@ -491,7 +503,7 @@ def buildCoverageStats(infile, outfile): @merge(buildCoverageStats, "coverage_stats.load") def loadCoverageStats(infiles, outfile): '''Import coverage statistics into SQLite''' - mappingqc.loadPicardCoverageStats(infiles, outfile) + bamstats.loadPicardCoverageStats(infiles, outfile) ############################################################################### ############################################################################### @@ 
-544,15 +556,14 @@ def loadXYRatio(infile, outfile): r"variants/\1.haplotypeCaller.g.vcf") def haplotypeCaller(infile, outfile): '''Call SNVs and indels using GATK HaplotypeCaller in individuals''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - job_options = getGATKOptions() + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" job_threads = PARAMS["gatk_threads"] dbsnp = PARAMS["gatk_dbsnp"] intervals = PARAMS["roi_intervals"] padding = PARAMS["roi_padding"] options = PARAMS["gatk_hc_options"] exome.haplotypeCaller(infile, outfile, genome, dbsnp, - intervals, padding, options) + intervals, padding, options, GATK_MEMORY) ############################################################################### @@ -755,9 +766,8 @@ def variantAnnotator(infiles, outfile): r"variants/all_samples.snp_vqsr.recal") def variantRecalibratorSnps(infile, outfile): '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" dbsnp = PARAMS["gatk_dbsnp"] - job_options = getGATKOptions() job_threads = PARAMS["gatk_threads"] track = P.snip(outfile, ".recal") kgenomes = PARAMS["gatk_kgenomes"] @@ -765,7 +775,7 @@ def variantRecalibratorSnps(infile, outfile): omni = PARAMS["gatk_omni"] mode = 'SNP' exome.variantRecalibrator(infile, outfile, genome, mode, dbsnp, - kgenomes, hapmap, omni) + kgenomes, hapmap, omni, GATK_MEMORY) ############################################################################### @@ -779,10 +789,10 @@ def variantRecalibratorSnps(infile, outfile): def applyVariantRecalibrationSnps(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" mode = 'SNP' exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode) + outfile, genome, mode, GATK_MEMORY) ############################################################################### # Indel recalibration @@ -793,15 +803,14 @@ def applyVariantRecalibrationSnps(infiles, outfile): r"variants/all_samples.vqsr.recal") def variantRecalibratorIndels(infile, outfile): '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - job_options = getGATKOptions() + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" job_threads = PARAMS["gatk_threads"] track = P.snip(outfile, ".recal") mills = PARAMS["gatk_mills"] dbsnp = PARAMS["gatk_dbsnp"] mode = 'INDEL' exome.variantRecalibrator(infile, outfile, genome, mode, - mills=mills) + mills=mills, gatkmem=GATK_MEMORY) ############################################################################### @@ -818,7 +827,7 @@ def applyVariantRecalibrationIndels(infiles, outfile): genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" mode = 'INDEL' exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode) + outfile, genome, mode, GATK_MEMORY) ############################################################################### @@ -829,7 +838,7 @@ def applyVariantRecalibrationIndels(infiles, outfile): r"variants/all_samples_dbnsfp.snpsift.vcf") def annotateVariantsDBNSFP(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] dbNSFP = PARAMS["annotation_dbnsfp"] dbN_annotators = 
PARAMS["annotation_dbnsfpannotators"] @@ -849,7 +858,7 @@ def annotateVariantsDBNSFP(infile, outfile): r"variants/all_samples_clinvar.snpsift.vcf") def annotateVariantsClinvar(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] clinvar = PARAMS["annotation_clinvar"] intout = outfile.replace("samples", "samples_clinvar") @@ -863,7 +872,7 @@ def annotateVariantsClinvar(infile, outfile): r"variants/all_samples_exac.snpsift.vcf") def annotateVariantsExAC(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] exac = PARAMS["annotation_exac"] statement = """SnpSift.sh annotate @@ -877,7 +886,7 @@ def annotateVariantsExAC(infile, outfile): r"variants/all_samples_gwasc.snpsift.vcf") def annotateVariantsGWASC(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] gwas_catalog = PARAMS["annotation_gwas_catalog"] @@ -891,7 +900,7 @@ def annotateVariantsGWASC(infile, outfile): r"variants/all_samples_phastcons.snpsift.vcf") def annotateVariantsPhastcons(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] genomeind = "%s/%s.fa.fai" % ( PARAMS['general_genome_dir'], @@ -909,7 +918,7 @@ def annotateVariantsPhastcons(infile, outfile): r"variants/all_samples_1000G.snpsift.vcf") def annotateVariants1000G(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] vcfs = [] @@ -941,7 +950,7 @@ def annotateVariants1000G(infile, outfile): r"variants/all_samples_dbsnp.snpsift.vcf") def annotateVariantsDBSNP(infile, outfile): '''Add annotations using SNPsift''' - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = PARAMS["annotation_threads"] dbsnp = PARAMS["annotation_dbsnp"] @@ -967,7 +976,7 @@ def annotateVariantsVEP(infile, outfile): ''' # infile - VCF # outfile - VCF with vep annotations - job_options = "-l mem_free=6G" + job_memory = "6G" job_threads = 4 VEP = PARAMS["annotation_vepannotators"].split(",") @@ -1565,8 +1574,8 @@ def confirmParentage(infiles, outfile): r"variants/\1.filtered.vcf") def deNovoVariants(infiles, outfile): '''Filter de novo variants based on provided jexl expression''' - job_options = getGATKOptions() - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + job_memory = GATK_MEMORY + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" pedfile, infile = infiles pedigree = csv.DictReader( iotools.open_file(pedfile), delimiter='\t', fieldnames=[ @@ -1585,7 +1594,7 @@ def deNovoVariants(infiles, outfile): r"variants/\1.filtered.table") def tabulateDeNovos(infile, outfile): '''Tabulate de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1610,7 +1619,7 @@ def loadDeNovos(infile, outfile): def lowerStringencyDeNovos(infiles, outfile): '''Filter lower stringency de novo variants based on provided jexl expression''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" pedfile, infile = infiles pedigree = 
csv.DictReader( iotools.open_file(pedfile), delimiter='\t', fieldnames=[ diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index d55ce7e27..64754db4f 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -30,6 +30,9 @@ randomseed: 1000 # gtf file containing exon annotations geneset: /path/to/ensembl.dir/geneset_all.gtf.gz +#picard memory +picard_memory: 9G + bwa: # location od BWA indexed genome @@ -65,6 +68,8 @@ gatk: threads: 8 + memory: 2G + dbsnp: /path/to/dbsnp_138.hg19.vcf hapmap: /path/to/hapmap_3.3.hg19.sites.vcf From 40fb76ecba40c1163b119ed201fa079969035063 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:23:23 +0000 Subject: [PATCH 04/43] remove mappingqc from pipeline_mapping.py --- cgatpipelines/tools/pipeline_mapping.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cgatpipelines/tools/pipeline_mapping.py b/cgatpipelines/tools/pipeline_mapping.py index 23400364b..a2479f84d 100644 --- a/cgatpipelines/tools/pipeline_mapping.py +++ b/cgatpipelines/tools/pipeline_mapping.py @@ -225,7 +225,6 @@ import cgat.BamTools.bamtools as BamTools import cgatpipelines.tasks.geneset as geneset import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc # Pipeline configuration P.get_parameters( @@ -247,7 +246,6 @@ restrict_interface=True)) geneset.PARAMS = PARAMS -mappingqc.PARAMS = PARAMS # Helper functions mapping tracks to conditions, etc # determine the location of the input files (reads). From bfbb604a40d67b1e6cf7f7c6627496f18b742708 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:27:52 +0000 Subject: [PATCH 05/43] remove mappingqc from pipeline_windows.py --- cgatpipelines/tools/pipeline_windows.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cgatpipelines/tools/pipeline_windows.py b/cgatpipelines/tools/pipeline_windows.py index 108f44e6f..a09ba435f 100644 --- a/cgatpipelines/tools/pipeline_windows.py +++ b/cgatpipelines/tools/pipeline_windows.py @@ -120,7 +120,7 @@ from cgatcore import pipeline as P import cgatpipelines.tasks.windows as windows import cgatpipelines.tasks.tracks as tracks -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats ######################################################################### @@ -210,13 +210,12 @@ def loadTagCounts(infiles, outfile): suffix=".tsv") -# @P.add_doc(mappingqc.loadPicardDuplicateStats) @merge(prepareTags, "picard_duplicates.load") def loadPicardDuplicateStats(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite table picard_duplicates. 
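# Note on the resource convention running through this series: the
# scheduler-specific settings (job_options = "-l mem_free=...") are being
# replaced by plain memory strings (gatkmem, picardmem, GATK_MEMORY,
# PICARD_MEMORY).  cgatcore's P.run() picks up resource variables such as
# job_memory and job_threads from the calling function's local scope, so a
# task only assigns them before building its statement.  A minimal sketch
# of the pattern (the task body itself is illustrative):

from cgatcore import pipeline as P

def countReads(infile, outfile, mem="2G"):
    '''count alignments in a BAM file, requesting `mem` per job.'''
    job_memory = mem    # read by P.run() when the job is submitted
    job_threads = 1     # likewise forwarded to the scheduler
    statement = '''samtools view -c %(infile)s > %(outfile)s'''
    P.run(statement)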
''' - mappingqc.loadPicardDuplicateStats( + bamstats.loadPicardDuplicateStats( infiles, outfile, pipeline_suffix=".bed.gz") From f2e54519ae459e013b87c9a5d7967f6253884958 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:36:08 +0000 Subject: [PATCH 06/43] remove mappingqc from pipeline peakcalling --- cgatpipelines/tools/pipeline_peakcalling.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cgatpipelines/tools/pipeline_peakcalling.py b/cgatpipelines/tools/pipeline_peakcalling.py index 423fd9afe..442dd2f1e 100644 --- a/cgatpipelines/tools/pipeline_peakcalling.py +++ b/cgatpipelines/tools/pipeline_peakcalling.py @@ -248,7 +248,7 @@ import cgatcore.experiment as E import cgatcore.iotools as iotools from cgatcore import pipeline as P -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.peakcalling as peakcalling import cgat.BamTools.bamtools as Bamtools import cgatcore.database as DB @@ -522,7 +522,7 @@ def loadIdxstats(infiles, outfile): idxstats output. Filename format is expected to be 'sample.idxstats' outfile : string Logfile. The table name will be derived from `outfile`.''' - mappingqc.loadIdxstats(infiles, outfile) + bamstats.loadIdxstats(infiles, outfile) @transform((filterChipBAMs, filterInputBAMs), @@ -533,9 +533,10 @@ def buildPicardStats(infiles, outfile): infile = infiles[0] reffile = os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa") - mappingqc.buildPicardAlignmentStats(infile, - outfile, - reffile) + bamstats.buildPicardAlignmentStats(infile, + outfile, + reffile, + picardmem="9G") @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -543,7 +544,7 @@ def buildPicardStats(infiles, outfile): def loadPicardStats(infiles, outfile): '''merge alignment stats into single tables. 
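# The bamstats helpers used above mirror the old mappingqc API, but take
# the Picard memory limit as an explicit picardmem argument rather than a
# module-level constant.  A usage sketch (file names are illustrative):

import cgatpipelines.tasks.bamstats as bamstats

bamstats.buildPicardAlignmentStats(
    "bam/sample1.bam",             # input BAM
    "bam/sample1.picard_stats",    # output metrics file
    "/path/to/genome.fa",          # reference sequence
    picardmem="9G")                # forwarded to the Picard JVM

# the loaders remain drop-in replacements for their mappingqc versions
bamstats.loadPicardAlignmentStats(
    ["bam/sample1.picard_stats"], "picard_stats.load")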
''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) @follows(loadFilteringStats, From 37f5213acf0799d95509bc105d7160cdbbee8624 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:40:43 +0000 Subject: [PATCH 07/43] remove mappingqc from rnaseqqc pipelin --- cgatpipelines/tools/pipeline_rnaseqqc.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/cgatpipelines/tools/pipeline_rnaseqqc.py b/cgatpipelines/tools/pipeline_rnaseqqc.py index 3b9220396..6cced4a81 100644 --- a/cgatpipelines/tools/pipeline_rnaseqqc.py +++ b/cgatpipelines/tools/pipeline_rnaseqqc.py @@ -193,7 +193,7 @@ import cgatcore.iotools as iotools import cgatpipelines.tasks.mapping as mapping import cgatpipelines.tasks.windows as windows -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.rnaseq as rnaseq from cgatcore import pipeline as P @@ -634,12 +634,12 @@ def buildBAMStats(infile, outfile): P.run(statement) -@P.add_doc(mappingqc.loadBAMStats) +@P.add_doc(bamstats.loadBAMStats) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildBAMStats, "bam_stats.load") def loadBAMStats(infiles, outfile): ''' load bam statistics into bam_stats table ''' - mappingqc.loadBAMStats(infiles, outfile) + bamstats.loadBAMStats(infiles, outfile) @P.add_doc(windows.summarizeTagsWithinContext) @@ -842,7 +842,7 @@ def buildRefFlat(infile, outfile): os.unlink(tmpflat) -@P.add_doc(mappingqc.buildPicardRnaSeqMetrics) +@P.add_doc(bamstats.buildPicardRnaSeqMetrics) @transform(mapReadsWithHisat, suffix(".bam"), add_inputs(buildRefFlat), @@ -857,16 +857,16 @@ def buildPicardRnaSeqMetrics(infiles, outfile): else: strand = "NONE" - mappingqc.buildPicardRnaSeqMetrics(infiles, strand, outfile) + bamstats.buildPicardRnaSeqMetrics(infiles, strand, outfile, "9G") -@P.add_doc(mappingqc.loadPicardRnaSeqMetrics) +@P.add_doc(bamstats.loadPicardRnaSeqMetrics) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildPicardRnaSeqMetrics, ["picard_rna_metrics.load", "picard_rna_histogram.load"]) def loadPicardRnaSeqMetrics(infiles, outfiles): '''merge alignment stats into single tables.''' - mappingqc.loadPicardRnaSeqMetrics(infiles, outfiles) + bamstats.loadPicardRnaSeqMetrics(infiles, outfiles) ################################################################### From cc83e7678484f33c6a12c03954e9f5f2fa80ff5e Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:49:13 +0000 Subject: [PATCH 08/43] remove mappingqc from exome cancer pipeline --- cgatpipelines/tools/pipeline_exome_cancer.py | 24 ++++++++++--------- .../tools/pipeline_exome_cancer/pipeline.yml | 2 ++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/cgatpipelines/tools/pipeline_exome_cancer.py b/cgatpipelines/tools/pipeline_exome_cancer.py index 3b4dab741..a3edb709c 100644 --- a/cgatpipelines/tools/pipeline_exome_cancer.py +++ b/cgatpipelines/tools/pipeline_exome_cancer.py @@ -155,7 +155,7 @@ import sqlite3 import cgatcore.iotools as iotools import cgatpipelines.tasks.mapping as mapping -import cgatpipelines.tasks.mappingqc as mappingqc +import cgatpipelines.tasks.bamstats as bamstats from cgatcore import pipeline as P import re import cgatpipelines.tasks.exome as exome @@ -187,9 +187,11 @@ def connect(): PARAMS = P.PARAMS +PICARD_MEMORY = PARAMS["picard_memory"] + mapping.PARAMS = PARAMS -mappingqc.PARAMS = PARAMS exome.PARAMS = PARAMS + 
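# PICARD_MEMORY above comes straight from the configuration: cgatcore
# flattens nested pipeline.yml sections into underscore-joined PARAMS
# keys, so the memory limits added to pipeline.yml in this series resolve
# as follows (values taken from the yml fragments shown earlier):
#
#   picard_memory: 9G        ->  PARAMS["picard_memory"]
#   gatk:
#     threads: 8             ->  PARAMS["gatk_threads"]
#     memory: 2G             ->  PARAMS["gatk_memory"]
#
# which is the lookup pattern pipeline_exome.py relies on:

PICARD_MEMORY = PARAMS["picard_memory"]
GATK_MEMORY = PARAMS["gatk_memory"]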
######################################################################### @@ -251,7 +253,7 @@ def mapReads(infile, outfile): @merge(mapReads, "picard_duplicate_stats.load") def loadPicardDuplicateStats(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' - mappingqc.loadPicardDuplicateStats(infiles, outfile) + bamstats.loadPicardDuplicateStats(infiles, outfile) ######################################################################### @@ -263,7 +265,7 @@ def loadPicardDuplicateStats(infiles, outfile): @merge("bam/*.picard_stats", "picard_stats.load") def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) ######################################################################### @@ -292,8 +294,8 @@ def buildCoverageStats(infile, outfile): ''' P.run(statement) - mappingqc.buildPicardCoverageStats( - infile, outfile, modified_baits, modified_baits) + bamstats.buildPicardCoverageStats( + infile, outfile, modified_baits, modified_baits, PICARD_MEMORY) iotools.zap_file(modified_baits) @@ -301,7 +303,7 @@ def buildCoverageStats(infile, outfile): @follows(buildCoverageStats) @merge(buildCoverageStats, "coverage_stats.load") def loadCoverageStats(infiles, outfile): - mappingqc.loadPicardCoverageStats(infiles, outfile) + bamstats.loadPicardCoverageStats(infiles, outfile) ######################################################################### ######################################################################### @@ -463,16 +465,16 @@ def runPicardOnRealigned(infile, outfile): genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"]) - mappingqc.buildPicardAlignmentStats(infile, outfile, genome) - mappingqc.buildPicardAlignmentStats(infile_tumor, - outfile_tumor, genome) + bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY) + bamstats.buildPicardAlignmentStats(infile_tumor, + outfile_tumor, genome, PICARD_MEMORY) @follows(runPicardOnRealigned) @merge("bam/*.realigned.picard_stats", "realigned_picard_stats.load") def loadPicardRealigenedAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' - mappingqc.loadPicardAlignmentStats(infiles, outfile) + bamstats.loadPicardAlignmentStats(infiles, outfile) ######################################################################### ######################################################################### diff --git a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml index 5ea8949bd..ce0761375 100644 --- a/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome_cancer/pipeline.yml @@ -9,6 +9,8 @@ # the genome to use genome: human_g1k_v37 +picard_memory: 9G + web_dir: ../web report_engine: cgatreport From 5f63ab32765da464165488e8494ecf65a3c58f4b Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:50:10 +0000 Subject: [PATCH 09/43] remove mappingqc.py - obsolete and superseded by bamstats.py --- cgatpipelines/tasks/mappingqc.py | 807 ------------------------------- 1 file changed, 807 deletions(-) delete mode 100644 cgatpipelines/tasks/mappingqc.py diff --git a/cgatpipelines/tasks/mappingqc.py b/cgatpipelines/tasks/mappingqc.py deleted file mode 100644 index a9cb44845..000000000 --- a/cgatpipelines/tasks/mappingqc.py +++ /dev/null @@ -1,807 +0,0 @@ -""" 
-mappingqc.py - Tasks for QC'ing mapping -=============================================== - -Reference ---------- - -""" - -import cgatcore.experiment as E -import os -import re -import pandas -import cgatcore.iotools as iotools -import cgat.BamTools.bamtools as BamTools -from cgatcore import pipeline as P - -PICARD_MEMORY = "9G" - - -def getNumReadsFromReadsFile(infile): - '''get number of reads from a .nreads file.''' - with iotools.open_file(infile) as inf: - line = inf.readline() - if not line.startswith("nreads"): - raise ValueError( - "parsing error in file '%s': " - "expected first line to start with 'nreads'") - nreads = int(line[:-1].split("\t")[1]) - return nreads - - -def buildPicardInsertSizeStats(infile, outfile, genome_file): - '''run Picard:CollectInsertSizeMetrics - - Collect insert size statistics. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectInsertSizeMetrics - INPUT=%(infile)s - REFERENCE_SEQUENCE=%(genome_file)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - >& %(outfile)s''' - - P.run(statement) - - -def buildPicardAlignmentStats(infile, outfile, genome_file): - '''run picard:CollectMultipleMetrics - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - tmp_bam = P.get_temp_filename(".") - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - # Picard seems to have problem if quality information is missing - # or there is no sequence/quality information within the bam file. - # Thus, add it explicitly. - statement = '''cat %(infile)s - | cgat bam2bam - -v 0 - --method=set-sequence - --output-sam - --log=%(outfile)s.bam2bam.log - > %(tmp_bam)s && - picard %(picard_opts)s CollectMultipleMetrics - INPUT=%(tmp_bam)s - REFERENCE_SEQUENCE=%(genome_file)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - >& %(outfile)s''' - - P.run(statement) - - os.unlink(tmp_bam) - -def buildPicardDuplicationStats(infile, outfile): - '''run picard:MarkDuplicates - - Record duplicate metrics using Picard, the marked records - are discarded. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - # currently, MarkDuplicates cannot handle split alignments from gsnap - # these can be identified by the custom XT tag. 
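# Aside: the awk filter below strips gsnap split alignments, identified by
# their custom XT tag, before MarkDuplicates runs.  The same filter
# written directly with pysam -- a standalone sketch, not pipeline code:

import pysam

def drop_split_alignments(inbam, outbam):
    '''copy `inbam` to `outbam`, dropping alignments carrying an XT tag.'''
    with pysam.AlignmentFile(inbam, "rb") as inf, \
            pysam.AlignmentFile(outbam, "wb", template=inf) as outf:
        for read in inf:
            if not read.has_tag("XT"):
                outf.write(read)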
- if ".gsnap.bam" in infile: - tmpf = P.get_temp_file(".") - tmpfile_name = tmpf.name - statement = '''samtools view -h %(infile)s - | awk "!/\\tXT:/" - | samtools view /dev/stdin -S -b > %(tmpfile_name)s; - ''' % locals() - data_source = tmpfile_name - else: - statement = "" - data_source = infile - - statement += '''picard %(picard_opts)s MarkDuplicates - INPUT=%(data_source)s - ASSUME_SORTED=true - METRICS_FILE=%(outfile)s - OUTPUT=/dev/null - VALIDATION_STRINGENCY=SILENT - ''' - P.run(statement) - - if ".gsnap.bam" in infile: - os.unlink(tmpfile_name) - - -def buildPicardDuplicateStats(infile, outfile): - '''run picard:MarkDuplicates - - Record duplicate metrics using Picard and keep the dedupped .bam - file. - - Pair duplication is properly handled, including inter-chromosomal - cases. SE data is also handled. These stats also contain a - histogram that estimates the return from additional sequecing. No - marked bam files are retained (/dev/null...) Note that picards - counts reads but they are in fact alignments. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - - ''' - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s MarkDuplicates - INPUT=%(infile)s - ASSUME_SORTED=true - METRICS_FILE=%(outfile)s.duplicate_metrics - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT; - ''' - statement += '''samtools index %(outfile)s ;''' - P.run(statement) - - -def buildPicardCoverageStats(infile, outfile, baits, regions): - '''run picard:CollectHSMetrics - - Generate coverage statistics for regions of interest from a bed - file using Picard. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - baits : :term:`bed` formatted file of bait regions - regions : :term:`bed` formatted file of target regions - - ''' - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectHsMetrics - BAIT_INTERVALS=%(baits)s - TARGET_INTERVALS=%(regions)s - INPUT=%(infile)s - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=LENIENT''' % locals() - P.run(statement) - - -def buildPicardGCStats(infile, outfile, genome_file): - """picard:CollectGCBiasMetrics - - Collect GC bias metrics. - - Arguments - --------- - infile : string - Input filename in :term:`BAM` format. - outfile : string - Output filename with picard output. - genome_file : string - Filename with genomic sequence. 
- - """ - - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts) CollectGcBiasMetrics - INPUT=%(infile)s - REFERENCE_SEQUENCE=%(genome_file)s - OUTPUT=%(outfile)s - VALIDATION_STRINGENCY=SILENT - CHART_OUTPUT=%(outfile)s.pdf - SUMMARY_OUTPUT=%(outfile)s.summary - >& %(outfile)s''' - - P.run(statement) - - -def loadPicardMetrics(infiles, outfile, suffix, - pipeline_suffix=".picard_stats", - tablename=None): - '''load picard metrics. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. - suffix : string - Suffix to append to table name. - pipeline_suffix : string - Suffix to remove from track name. - tablename : string - Tablename to use. If unset, the table name will be derived - from `outfile` and suffix as ``to_table(outfile) + "_" + - suffix``. - - ''' - - if not tablename: - tablename = "%s_%s" % (P.to_table(outfile), suffix) - - outf = P.get_temp_file(".") - - filenames = ["%s.%s" % (x, suffix) for x in infiles] - - first = True - for filename in filenames: - track = P.snip(os.path.basename(filename), "%s.%s" % - (pipeline_suffix, suffix)) - - if not os.path.exists(filename): - E.warn("File %s missing" % filename) - continue - - lines = iotools.open_file(filename, "r").readlines() - - # extract metrics part - rx_start = re.compile("## METRICS CLASS") - for n, line in enumerate(lines): - if rx_start.search(line): - lines = lines[n + 1:] - break - - for n, line in enumerate(lines): - if not line.strip(): - lines = lines[:n] - break - - if len(lines) == 0: - E.warn("no lines in %s: %s" % (track, filename)) - continue - - if first: - outf.write("%s\t%s" % ("track", lines[0])) - fields = lines[0][:-1].split("\t") - else: - f = lines[0][:-1].split("\t") - if f != fields: - raise ValueError( - "file %s has different fields: expected %s, got %s" % - (filename, fields, f)) - - first = False - for i in range(1, len(lines)): - outf.write("%s\t%s" % (track, lines[i])) - - outf.close() - - P.load(outf.name, - outfile, - tablename=tablename, - options="--add-index=track --allow-empty-file") - - os.unlink(outf.name) - - -def loadPicardHistogram(infiles, outfile, suffix, column, - pipeline_suffix=".picard_stats", tablename=False): - '''extract a histogram from a picard output file and load - it into database. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. - suffix : string - Suffix to append to table name. - column : string - Column name to take from the histogram. - pipeline_suffix : string - Suffix to remove from track name. - tablename : string - Tablename to use. If unset, the table name will be derived - from `outfile` and suffix as ``to_table(outfile) + "_" + - suffix``. 
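# Both loaders in this deleted module key on the layout of Picard metric
# files: a commented preamble, a "## METRICS CLASS" line introducing a
# tab-separated table, and an optional "## HISTOGRAM" section.  The same
# parsing idea as a self-contained pandas sketch (the file name is
# illustrative):

import io
import pandas as pd

def read_picard_metrics(path, marker="## METRICS CLASS"):
    '''return the tab-separated table following `marker` as a DataFrame.'''
    with open(path) as inf:
        lines = inf.readlines()
    for n, line in enumerate(lines):     # skip to the section marker
        if line.startswith(marker):
            lines = lines[n + 1:]
            break
    for n, line in enumerate(lines):     # the table ends at a blank line
        if not line.strip():
            lines = lines[:n]
            break
    return pd.read_csv(io.StringIO("".join(lines)), sep="\t")

df = read_picard_metrics("sample1.picard_stats.alignment_summary_metrics")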
- ''' - - if not tablename: - tablename = "%s_%s" % (P.to_table(outfile), suffix) - tablename = tablename.replace("_metrics", "_histogram") - - # some files might be missing - xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))] - - if len(xfiles) == 0: - E.warn("no files for %s" % tablename) - return - - header = ",".join([os.path.basename(x) for x in xfiles]) - filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles]) - - # there might be a variable number of columns in the tables - # only take the first ignoring the rest - - load_statement = P.build_load_statement( - tablename, - options="--add-index=track " - " --header-names=%s,%s" - " --allow-empty-file" - " --replace-header" % (column, header)) - - statement = """cgat combine_tables - --regex-start="## HISTOGRAM" - --missing-value=0 - --take=2 - %(filenames)s - | %(load_statement)s - >> %(outfile)s - """ - - to_cluster = False - - P.run(statement) - - -def loadPicardAlignmentStats(infiles, outfile): - '''load all output from Picard's CollectMultipleMetrics into database. - - Loads tables into database with prefix derived from outfile: - * [outfile]_alignment_summary_metric - * [outfile]_insert_size_metrics - * [outfile]_quality_by_cycle_metrics - * [outfile]_quality_distribution_metrics - * [outfile]_insert_size_metrics - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - - ''' - - loadPicardMetrics(infiles, outfile, "alignment_summary_metrics") - - # insert size metrics only available for paired-ended data - loadPicardMetrics(infiles, outfile, "insert_size_metrics") - - histograms = (("quality_by_cycle_metrics", "cycle"), - ("quality_distribution_metrics", "quality"), - ("insert_size_metrics", "insert_size")) - - for suffix, column in histograms: - loadPicardHistogram(infiles, outfile, suffix, column) - - -def loadPicardDuplicationStats(infiles, outfiles): - '''load picard duplicate filtering stats into database. - - Loads two tables into the database - * picard_duplication_metrics - * picard_complexity_histogram - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - # SNS: added to enable naming consistency - - outfile_metrics, outfile_histogram = outfiles - - suffix = "picard_duplication_metrics" - - # the loading functions expect "infile_name.pipeline_suffix" as the infile - # names. - infile_names = [x[:-len("." 
+ suffix)] for x in infiles] - - loadPicardMetrics(infile_names, outfile_metrics, suffix, "", - tablename="picard_duplication_metrics") - - infiles_with_histograms = [] - - # The complexity histogram is only present for PE data, so we must check - # because by design the pipeline does not track endedness - for infile in infile_names: - with_hist = False - with open(".".join([infile, suffix]), "r") as open_infile: - for line in open_infile: - if line.startswith("## HISTOGRAM"): - infiles_with_histograms.append(infile) - break - - if len(infiles_with_histograms) > 0: - loadPicardHistogram(infiles_with_histograms, - outfile_histogram, - suffix, - "coverage_multiple", - "", - tablename="picard_complexity_histogram") - else: - with open(outfile_histogram, "w") as ofh: - ofh.write("No histograms detected, no data loaded.") - - -def loadPicardDuplicateStats(infiles, outfile, pipeline_suffix=".bam"): - '''load picard duplicate filtering stats. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - pipeline_suffix : string - Suffix appended to pipeline output file, will be removed to - define track. - ''' - - loadPicardMetrics( - infiles, outfile, "duplicate_metrics", - pipeline_suffix=pipeline_suffix) - loadPicardHistogram(infiles, - outfile, - "duplicate_metrics", - "duplicates", - pipeline_suffix=pipeline_suffix) - - -def loadPicardCoverageStats(infiles, outfile): - '''import coverage statistics into database. - - Arguments - --------- - infiles : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - - outf = P.get_temp_file(".") - first = True - for f in infiles: - track = P.snip(os.path.basename(f), ".cov") - lines = [x for x in open(f, "r").readlines() - if not x.startswith("#") and x.strip()] - if first: - outf.write("%s\t%s" % ("track", lines[0])) - first = False - outf.write("%s\t%s" % (track, lines[1])) - outf.close() - P.load(outf.name, - outfile, - options="--ignore-empty --add-index=track") - os.unlink(outf.name) - - -def buildBAMStats(infile, outfile): - '''Count number of reads mapped, duplicates, etc. using - bam2stats.py - - Arguments - --------- - infile : string - Input file in :term:`BAM` format - outfile : string - Output file in :term:`tsv` format. - - ''' - - statement = '''cgat bam2stats - --force-output - --output-filename-pattern=%(outfile)s.%%s - < %(infile)s - > %(outfile)s''' - P.run(statement) - - -def loadBAMStats(infiles, outfile): - '''load output of :func:`buildBAMStats` into database. - - Arguments - --------- - infiles : string - Input files, output from :func:`buildBAMStats`. - outfile : string - Logfile. The table name will be derived from `outfile`. 
- ''' - - header = ",".join([P.snip(os.path.basename(x), ".readstats") - for x in infiles]) - filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles]) - tablename = P.to_table(outfile) - - load_statement = P.build_load_statement( - tablename, - options="--add-index=track " - " --allow-empty-file") - - E.info("loading bam stats - summary") - statement = """cgat combine_tables - --header-names=%(header)s - --missing-value=0 - --ignore-empty - %(filenames)s - | perl -p -e "s/bin/track/" - | cgat table2table --transpose - | %(load_statement)s - > %(outfile)s""" - - to_cluster = False - - P.run(statement) - - for suffix in ("nm", "nh"): - E.info("loading bam stats - %s" % suffix) - filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) - - load_statement = P.build_load_statement( - "%s_%s" % (tablename, suffix), - options="--allow-empty-file") - - statement = """cgat combine_tables - --header-names=%(header)s - --skip-titles - --missing-value=0 - --ignore-empty - %(filenames)s - | perl -p -e "s/bin/%(suffix)s/" - | %(load_statement)s - >> %(outfile)s """ - - to_cluster = False - - P.run(statement) - - # load mapping qualities, there are two columns per row - # 'all_reads' and 'filtered_reads' - # Here, only filtered_reads are used (--take=3) - for suffix in ("mapq",): - E.info("loading bam stats - %s" % suffix) - filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles]) - - load_statement = P.build_load_statement( - "%s_%s" % (tablename, suffix), - options=" --allow-empty-file") - - statement = """cgat combine_tables - --header-names=%(header)s - --skip-titles - --missing-value=0 - --ignore-empty - --take=3 - %(filenames)s - | perl -p -e "s/bin/%(suffix)s/" - | %(load_statement)s - >> %(outfile)s """ - - to_cluster = False - - P.run(statement) - - -def buildPicardRnaSeqMetrics(infiles, strand, outfile): - '''run picard:RNASeqMetrics - - - - Arguments - --------- - infiles : string - Input filename in :term:`BAM` format. - Genome file in refflat format - (http://genome.ucsc.edu/goldenPath/gbdDescriptionsOld.html#RefFlat) - outfile : string - Output filename with picard output. - - ''' - job_memory = PICARD_MEMORY - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() - job_threads = 3 - infile, genome = infiles - - if BamTools.getNumReads(infile) == 0: - E.warn("no reads in %s - no metrics" % infile) - iotools.touch_file(outfile) - return - - statement = '''picard %(picard_opts)s CollectRnaSeqMetrics - REF_FLAT=%(genome)s - INPUT=%(infile)s - ASSUME_SORTED=true - OUTPUT=%(outfile)s - STRAND=%(strand)s - VALIDATION_STRINGENCY=SILENT - ''' - P.run(statement) - - -def loadPicardRnaSeqMetrics(infiles, outfiles): - '''load picard rna stats into database. - - Loads tables into the database - * picard_rna_metrics - * picard_rna_histogram - - Arguments - --------- - infile : string - Filenames of files with picard metric information. Each file - corresponds to a different track. - outfiles : string - Logfile. The table names will be derived from `outfile`. - ''' - - outfile_metrics, outfile_histogram = outfiles - - suffix = "picard_rna_metrics" - - # the loading functions expect "infile_name.pipeline_suffix" as the infile - # names. - infile_names = [x[:-len("." 
+ suffix)] for x in infiles] - - loadPicardMetrics(infile_names, outfile_metrics, suffix, "", - tablename="picard_rna_metrics") - - infiles_with_histograms = [] - - # Checking if histogram is present (?is this necessary) - for infile in infile_names: - with_hist = False - with open(".".join([infile, suffix]), "r") as open_infile: - for line in open_infile: - if line.startswith("## HISTOGRAM"): - infiles_with_histograms.append(infile) - break - - if len(infiles_with_histograms) > 0: - loadPicardHistogram(infiles_with_histograms, - outfile_histogram, - suffix, - "coverage_multiple", - "", - tablename="picard_rna_histogram") - else: - with open(outfile_histogram, "w") as ofh: - ofh.write("No histograms detected, no data loaded.") - - -def loadIdxstats(infiles, outfile): - '''take list of file paths to samtools idxstats output files - and merge to create single dataframe containing mapped reads per - contig for each track. This dataframe is then loaded into - database. - - Loads tables into the database - * idxstats_reads_per_chromosome - - Arguments - --------- - infiles : list - list where each element is a string of the filename containing samtools - idxstats output. Filename format is expected to be 'sample.idxstats' - outfile : string - Logfile. The table name will be derived from `outfile`. - ''' - - outf = P.get_temp_file(".") - dfs = [] - for f in infiles: - track = P.snip(f, ".idxstats").split('/')[-1] - - if not os.path.exists(f): - E.warn("File %s missing" % f) - continue - - # reformat idx stats - df = pandas.read_csv(f, sep='\t', header=None) - df.columns = ['region', 'length', 'mapped', 'unmapped'] - - # calc total reads mapped & unmappedpep - total_reads = df.unmapped.sum() + df.mapped.sum() - total_mapped_reads = df.mapped.sum() - - reformatted_df = pandas.DataFrame([['total_mapped_reads', total_mapped_reads], - ['total_reads', total_reads], - ['track', track]], columns=(['region', 'mapped'])) - - # reformat the df - df = df.append(reformatted_df, ignore_index=True) - df.set_index('region', inplace=True) - df1 = df[['mapped']].T - # set track as index - df1.set_index('track', inplace=True) - dfs.append(df1) - - # merge dataframes into single table - master_df = pandas.concat(dfs) - master_df.drop('*', axis=1, inplace=True) - # transform dataframe to avoid reaching column limit - master_df = master_df.T - master_df.to_csv(outf, sep='\t', index=True) - outf.close() - - P.load(outf.name, - outfile, - options="--ignore-empty --add-index=track") - os.unlink(outf.name) From 61427c43d864f47439e62a2cebde34e27985f62c Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 00:56:58 +0000 Subject: [PATCH 10/43] fixing errors --- cgatpipelines/tools/pipeline_exome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 1e170c00e..1e25cba7b 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -163,7 +163,7 @@ # load modules from ruffus import transform, mkdir, follows, merge, regex, suffix, \ jobs_limit, files, collate, add_inputs, formatter, \ - active_if + active_if, originate from ruffus.combinatorics import permutations import sys import os From d1792d12685faa709b8262651114507dc4bb4ba7 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 01:25:02 +0000 Subject: [PATCH 11/43] troubleshooting --- cgatpipelines/tools/pipeline_exome.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git 
a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 1e25cba7b..162a74a26 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -315,12 +315,13 @@ def loadPicardAlignStats(infiles, outfile): @transform(mapReads, regex(r"bam/(\S+).bam"), - add_inputs(r"gatk/\1.merged.bam.count"), + add_inputs(createSequenceDictionary), r"gatk/\1.readgroups.bam") -def GATKReadGroups(infile, dictionary, outfile): +def GATKReadGroups(infiles, outfile): '''Reorders BAM according to reference fasta and adds read groups using GATK''' + infile, dictionary = infiles track = re.sub(r'-\w+-\w+\.bam', '', os.path.basename(infile)) tmpdir_gatk = P.get_temp_dir('.') job_threads = PARAMS["gatk_threads"] From 2c3aabfaaaeeba4154ada00155f0531d529e0c0b Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 14 Feb 2022 01:26:01 +0000 Subject: [PATCH 12/43] troubleshooting --- cgatpipelines/tasks/exome.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 3b7b61059..6a554d3a2 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -48,11 +48,11 @@ def GATKReadGroups(infile, outfile, dictionary, job_threads = 3 statement = '''picard ReorderSam - -INPUT=%(infile)s - -OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam - -SEQUENCE_DICTIONARY=%(dictionary)s - -ALLOW_INCOMPLETE_DICT_CONCORDANCE=true - -VALIDATION_STRINGENCY=SILENT ;''' % locals() + -INPUT %(infile)s + -OUTPUT %(tmpdir_gatk)s/%(track)s.reordered.bam + -SEQUENCE_DICTIONARY %(dictionary)s + -ALLOW_INCOMPLETE_DICT_CONCORDANCE true + -VALIDATION_STRINGENCY SILENT ;''' % locals() statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; ''' % locals() From a43395681bc880ae87da868ce297fa22ec1ac3c4 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 25 Apr 2022 14:07:34 +0100 Subject: [PATCH 13/43] Removed realignment step as per GATK4 --- cgatpipelines/tasks/exome.py | 27 ------------ cgatpipelines/tools/pipeline_exome.py | 62 +++++---------------------- 2 files changed, 10 insertions(+), 79 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 6a554d3a2..2cf2a3a18 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -75,33 +75,6 @@ def GATKReadGroups(infile, outfile, dictionary, ############################################################################## -def GATKIndelRealign(infile, outfile, genome, intervals, padding, threads=4, gatkmem="2G"): - '''Realigns BAMs around indels using GATK''' - - intervalfile = outfile.replace(".bam", ".intervals") - job_memory = gatkmem - job_threads = 3 - - statement = '''GenomeAnalysisTK - -T RealignerTargetCreator - -o %(intervalfile)s - --num_threads %(threads)s - -R %(genome)s - -L %(intervals)s - -ip %(padding)s - -I %(infile)s; ''' % locals() - - statement += '''GenomeAnalysisTK - -T IndelRealigner - -o %(outfile)s - -R %(genome)s - -I %(infile)s - -targetIntervals %(intervalfile)s;''' % locals() - P.run(statement) - -############################################################################## - - def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, solid_options="", gatkmem="2G"): '''Recalibrates base quality scores using GATK''' diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 162a74a26..970a8d053 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -362,22 
+362,6 @@ def loadPicardDuplicateStatsLane(infiles, outfile): @transform(RemoveDuplicatesLane, regex(r"gatk/(\S+).dedup.bam"), - r"gatk/\1.realigned2.bam") -def GATKIndelRealignLane(infile, outfile): - '''realigns around indels using GATK''' - threads = PARAMS["gatk_threads"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - intervals = PARAMS["roi_intervals"] - padding = PARAMS["roi_padding"] - exome.GATKIndelRealign(infile, outfile, genome, intervals, padding, - threads, GATK_MEMORY) - iotools.zap_file(infile) - -############################################################################### - - -@transform(GATKIndelRealignLane, - regex(r"gatk/(\S+).realigned2.bam"), r"gatk/\1.bqsr.bam") def GATKBaseRecal(infile, outfile): '''recalibrates base quality scores using GATK''' @@ -457,37 +441,11 @@ def loadPicardDuplicateStatsSample(infiles, outfile): ############################################################################### ############################################################################### ############################################################################### -# Realign sample-by-sample +# Coverage of targetted area @transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"), - add_inputs(r"gatk/\1.merged.bam.count"), - r"gatk/\1.realigned.bam") -def GATKIndelRealignSample(infiles, outfile): - '''realigns around indels using GATK''' - infile, countfile = infiles - threads = PARAMS["gatk_threads"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" - intervals = PARAMS["roi_intervals"] - padding = PARAMS["roi_padding"] - countf = open(countfile, "r") - if countf.read() > '1': - exome.GATKIndelRealign(infiles[0], outfile, genome, intervals, - padding, threads, GATK_MEMORY) - else: - shutil.copyfile(infile, outfile) - shutil.copyfile(infile + ".bai", outfile + ".bai") -# iotools.zap_file(infile) - -############################################################################### -############################################################################### -############################################################################### -# Coverage of targetted area - - -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), r"gatk/\1.cov") def buildCoverageStats(infile, outfile): '''Generate coverage statistics for regions of interest from a bed @@ -513,8 +471,8 @@ def loadCoverageStats(infiles, outfile): @follows(mkdir("xy_ratio")) -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), +@transform(RemoveDuplicatesSample, + regex(r"gatk/(\S+).dedup2.bam"), r"xy_ratio/\1.sex") def calcXYratio(infile, outfile): '''Guess the sex of a sample based on ratio of reads @@ -553,7 +511,7 @@ def loadXYRatio(infile, outfile): @follows(mkdir("variants")) -@transform(GATKIndelRealignSample, regex(r"gatk/(\S+).realigned.bam"), +@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"), r"variants/\1.haplotypeCaller.g.vcf") def haplotypeCaller(infile, outfile): '''Call SNVs and indels using GATK HaplotypeCaller in individuals''' @@ -598,8 +556,8 @@ def SelectExonicHapmapVariants(infile, outfile): @follows(SelectExonicHapmapVariants) -@transform(GATKIndelRealignSample, - regex(r"gatk/(\S+).realigned.bam"), +@transform(RemoveDuplicatesSample, + regex(r"gatk/(\S+).dedup2.bam"), add_inputs("hapmap/hapmap_exome.bed"), r"hapmap/\1.hapmap.vcf") def HapMapGenotype(infiles, outfile): @@ -733,7 +691,7 @@ def loadTableSnpEff(infile, outfile): # GATK Variant Annotator -@merge(GATKIndelRealignSample, 
"gatk/all_samples.list") +@merge(RemoveDuplicatesSample, "gatk/all_samples.list") def listOfBAMs(infiles, outfile): '''generates a file containing a list of BAMs for use in VQSR''' with iotools.open_file(outfile, "w") as outf: @@ -999,7 +957,7 @@ def annotateVariantsVEP(infile, outfile): @follows(mkdir("variant_tables")) -@transform(GATKIndelRealignSample, regex(r"gatk/(.*).realigned.bam"), +@transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"), add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv") def makeAnnotationsTables(infiles, outfile): ''' @@ -1010,7 +968,7 @@ def makeAnnotationsTables(infiles, outfile): bamname = infiles[0] inputvcf = infiles[1] TF = P.get_temp_filename(".") - samplename = bamname.replace(".realigned.bam", + samplename = bamname.replace(".dedup2.bam", ".bam").replace("gatk/", "") statement = '''bcftools view -h %(inputvcf)s | @@ -2060,7 +2018,7 @@ def mapping_tasks(): @follows(GATKBaseRecal, loadPicardDuplicateStatsLane, - GATKIndelRealignSample, + RemoveDuplicatesSample, loadCoverageStats) def gatk(): pass From d377b3347d9078213cf0cb6ca09c6bff66120405 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 25 Apr 2022 18:47:47 +0100 Subject: [PATCH 14/43] updated BSQR procedure to gatk4 syntax --- cgatpipelines/tasks/exome.py | 34 ++++++++++++------------- cgatpipelines/tools/pipeline_exome.py | 36 +++++++++++++-------------- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 2cf2a3a18..cac90adbe 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -84,19 +84,19 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK - -T BaseRecalibrator - --out %(tmpdir_gatk)s/%(track)s.recal.grp + statement = '''gatk + BaseRecalibrator + -O %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -L %(intervals)s -ip %(padding)s -I %(infile)s - --knownSites %(dbsnp)s %(solid_options)s ; + --known-sites %(dbsnp)s %(solid_options)s ; ''' % locals() - statement += '''GenomeAnalysisTK - -T PrintReads -o %(outfile)s - -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp + statement += '''gatk + ApplyBSQR -O %(outfile)s + --bsqr-recal-file %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -I %(infile)s ; ''' % locals() @@ -114,12 +114,12 @@ def haplotypeCaller(infile, outfile, genome, job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK + statement = '''gatk -T HaplotypeCaller -ERC GVCF -variant_index_type LINEAR -variant_index_parameter 128000 - -o %(outfile)s + -O %(outfile)s -R %(genome)s -I %(infile)s --dbsnp %(dbsnp)s @@ -137,8 +137,8 @@ def genotypeGVCFs(inputfiles, outfile, genome, options, gatkmem="2G"): job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK - -T GenotypeGVCFs + statement = '''gatk + GenotypeGVCFs -o %(outfile)s -R %(genome)s --variant %(inputfiles)s''' % locals() @@ -226,7 +226,7 @@ def variantAnnotator(vcffile, bamlist, outfile, genome, else: anno = "" - statement = '''GenomeAnalysisTK -T VariantAnnotator + statement = '''gatk VariantAnnotator -R %(genome)s -I %(bamlist)s -A SnpEff @@ -249,7 +249,7 @@ def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, track = P.snip(outfile, ".recal") if mode == 'SNP': - statement = '''GenomeAnalysisTK -T VariantRecalibrator + statement = '''gatk VariantRecalibrator -R %(genome)s -input %(infile)s -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s @@ -265,7 +265,7 @@ 
def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, -rscriptFile %(track)s.plots.R ''' % locals() P.run(statement) elif mode == 'INDEL': - statement = '''GenomeAnalysisTK -T VariantRecalibrator + statement = '''gatk VariantRecalibrator -R %(genome)s -input %(infile)s -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s @@ -287,7 +287,7 @@ def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode, gatkm job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK -T ApplyRecalibration + statement = '''gatk ApplyRecalibration -R %(genome)s -input:VCF %(vcf)s -recalFile %(recal)s @@ -305,7 +305,7 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem): job_memory = gatkmem job_threads = 3 - statement = '''GenomeAnalysisTK -T VariantsToTable + statement = '''gatk VariantsToTable -R %(genome)s -V %(infile)s --showFiltered @@ -319,7 +319,7 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem): def selectVariants(infile, outfile, genome, select): '''Filter de novo variants based on provided jexl expression''' - statement = '''GenomeAnalysisTK -T SelectVariants + statement = '''gatk SelectVariants -R %(genome)s --variant %(infile)s -select '%(select)s' diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 970a8d053..b85a79b58 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -369,7 +369,7 @@ def GATKBaseRecal(infile, outfile): outtrack = P.snip(os.path.basename(outfile), ".bam") dbsnp = PARAMS["gatk_dbsnp"] solid_options = PARAMS["gatk_solid_options"] - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" intervals = PARAMS["roi_intervals"] padding = PARAMS["roi_padding"] if PARAMS["targetted"]: @@ -530,7 +530,7 @@ def haplotypeCaller(infile, outfile): @merge(haplotypeCaller, "variants/all_samples.vcf") def genotypeGVCFs(infiles, outfile): '''Joint genotyping of all samples together''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" inputfiles = " --variant ".join(infiles) options = PARAMS["gatk_genotype_options"] exome.genotypeGVCFs(inputfiles, outfile, genome, options) @@ -563,7 +563,7 @@ def SelectExonicHapmapVariants(infile, outfile): def HapMapGenotype(infiles, outfile): '''Genotype HapMap SNPs using HaplotypeCaller in each individual''' infile, intervals = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" dbsnp = PARAMS["gatk_dbsnp"] padding = PARAMS["hapmap_padding"] options = PARAMS["hapmap_hc_options"] @@ -674,7 +674,7 @@ def annotateVariantsSNPeff(infile, outfile): r"variants/all_samples.snpeff.table") def vcfToTableSnpEff(infile, outfile): '''Converts vcf to tab-delimited file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["annotation_snpeff_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -709,7 +709,7 @@ def listOfBAMs(infiles, outfile): def variantAnnotator(infiles, outfile): '''Annotate variant file using GATK VariantAnnotator''' vcffile, bamlist, snpeff_file = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" dbsnp = PARAMS["gatk_dbsnp"] annotations = 
PARAMS["gatk_variant_annotations"] exome.variantAnnotator(vcffile, bamlist, outfile, genome, @@ -783,7 +783,7 @@ def variantRecalibratorIndels(infile, outfile): def applyVariantRecalibrationIndels(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" mode = 'INDEL' exome.applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode, GATK_MEMORY) @@ -1394,7 +1394,7 @@ def findGenes(infile, outfile): geneList = P.as_list(PARAMS["annotation_genes_of_interest"]) expression = '\'||SNPEFF_GENE_NAME==\''.join(geneList) statement = '''GenomeAnalysisTK -T VariantFiltration - -R %%(genome_dir)s/%%(gatkgenome)s.fa + -R %%(genome_dir)s/%%(genome)s.fa --variant %(infile)s --filterExpression "SNPEFF_GENE_NAME=='%(expression)s'" --filterName "GENE_OF_INTEREST" -o %(outfile)s''' % locals() @@ -1416,7 +1416,7 @@ def findGenes(infile, outfile): r"variants/all_samples.snpsift.table") def vcfToTable(infile, outfile): '''Converts vcf to tab-delimited file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1598,7 +1598,7 @@ def lowerStringencyDeNovos(infiles, outfile): r"variants/\1.denovos.table") def tabulateLowerStringencyDeNovos(infile, outfile): '''Tabulate lower stringency de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1626,7 +1626,7 @@ def loadLowerStringencyDeNovos(infile, outfile): def dominantVariants(infiles, outfile): '''Filter variants according to autosomal dominant disease model''' pedfile, infile = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status']) @@ -1659,7 +1659,7 @@ def dominantVariants(infiles, outfile): r"variants/\1.dominant.table") def tabulateDoms(infile, outfile): '''Tabulate dominant disease candidate variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1688,7 +1688,7 @@ def loadDoms(infile, outfile): r"variants/\1.recessive.vcf") def recessiveVariants(infiles, outfile): '''Filter variants according to autosomal recessive disease model''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" pedfile, infile = infiles pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=['family', 'sample', 'father', @@ -1732,7 +1732,7 @@ def recessiveVariants(infiles, outfile): r"variants/\1.recessive.table") def tabulateRecs(infile, outfile): '''Tabulate potential homozygous recessive disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1760,7 +1760,7 
@@ def loadRecs(infile, outfile): def xlinkedVariants(infiles, outfile): '''Find maternally inherited X chromosome variants in male patients''' track = P.snip(os.path.basename(outfile), ".vcf") - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" pedfile, infile = infiles pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=['family', 'sample', 'father', @@ -1816,7 +1816,7 @@ def xlinkedVariants(infiles, outfile): r"variants/\1.xlinked.table") def tabulateXs(infile, outfile): '''Tabulate potential X-linked disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] exome.vcfToTable(infile, outfile, genome, columns) @@ -1848,7 +1848,7 @@ def loadXs(infile, outfile): def phasing(infiles, outfile): '''phase variants with GATK''' infile, pedfile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" statement = '''GenomeAnalysisTK -T PhaseByTransmission -R %(genome)s -V %(infile)s @@ -1867,7 +1867,7 @@ def readbackedphasing(infiles, outfile): '''phase variants with ReadBackedPhasing''' job_memory = "32G" infile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4 -R %(genome)s -I %(bamlist)s @@ -1926,7 +1926,7 @@ def candidateCoverage(infile, outfile): all_exons = PARAMS["coverage_all_exons"] candidates = PARAMS["coverage_candidates"] candidates = candidates.replace(",", " -e ") - genome = PARAMS["genome_dir"] + "/" + PARAMS["gatkgenome"] + ".fa" + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" threshold = PARAMS["coverage_threshold"] statement = '''zcat %(all_exons)s | grep -e %(candidates)s | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - > From 0b960e6302be8616c95c9ba60bf19eca6414a49b Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 4 May 2022 23:39:04 +0100 Subject: [PATCH 15/43] Gatk4 update: GVCF and GVCF database --- cgatpipelines/tasks/exome.py | 49 ++++++++++++++++--- cgatpipelines/tools/pipeline_exome.py | 20 ++++++-- .../tools/pipeline_exome/pipeline.yml | 5 ++ 3 files changed, 63 insertions(+), 11 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index cac90adbe..fda9bfd2e 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -95,8 +95,8 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, ''' % locals() statement += '''gatk - ApplyBSQR -O %(outfile)s - --bsqr-recal-file %(tmpdir_gatk)s/%(track)s.recal.grp + ApplyBQSR -O %(outfile)s + --bqsr-recal-file %(tmpdir_gatk)s/%(track)s.recal.grp -R %(genome)s -I %(infile)s ; ''' % locals() @@ -115,14 +115,12 @@ def haplotypeCaller(infile, outfile, genome, job_threads = 3 statement = '''gatk - -T HaplotypeCaller + HaplotypeCaller -ERC GVCF - -variant_index_type LINEAR - -variant_index_parameter 128000 -O %(outfile)s -R %(genome)s -I %(infile)s - --dbsnp %(dbsnp)s + --dbsnp %(dbsnp)s -L %(intervals)s -ip %(padding)s %(options)s''' % locals() @@ -132,7 +130,44 @@ def haplotypeCaller(infile, outfile, genome, ############################################################################## -def genotypeGVCFs(inputfiles, outfile, genome, options, gatkmem="2G"): +def 
consolidateGVCFs(inputfiles, outfile, inputlen, dbmem="20G"):
+    '''Consolidate per-sample GVCFs into a GenomicsDB workspace'''
+
+    runmem = dbmem
+    if not re.match(r' ', runmem):
+        runmem = re.sub(r'([KMGT])', r' \1', runmem)
+    number, unit = [string.strip() for string in runmem.split()]
+    job_threads = 2
+    job_memory = f"{float(number)*1.2/job_threads:.1f}{unit}"
+    tmpdir = P.get_temp_dir('.')
+
+    tmpfile = P.get_temp_filename(".", suffix=".list")
+    chrID = [ 'chr{}'.format(x) for x in list(range(1,23)) + ['X', 'Y'] ]
+    with open(tmpfile, 'w+') as f:
+        for contig in chrID:
+            f.write('%s\n' % contig)
+
+    statement = '''gatk
+    --java-options "-Xmx%(dbmem)s -Xms%(dbmem)s"
+    GenomicsDBImport
+    -V %(inputfiles)s
+    -L %(tmpfile)s
+    --reader-threads %(job_threads)s
+    --tmp-dir %(tmpdir)s
+    --genomicsdb-workspace-path %(outfile)s
+    ''' % locals()
+    if inputlen > 50:
+        statement += ''' --batch-size 50'''
+    statement += ''' ;rm -rf %(tmpdir)s ;''' % locals()
+    P.run(statement)
+
+    os.unlink(tmpfile)
+
+
+##############################################################################
+
+
+def genotypeGVCFs(infile, outfile, genome, options, gatkmem="2G"):
     '''Joint genotyping of all samples together'''
     job_memory = gatkmem
     job_threads = 3
diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py
index b85a79b58..bc715da61 100644
--- a/cgatpipelines/tools/pipeline_exome.py
+++ b/cgatpipelines/tools/pipeline_exome.py
@@ -323,7 +323,6 @@ def GATKReadGroups(infiles, outfile):
 
     infile, dictionary = infiles
     track = re.sub(r'-\w+-\w+\.bam', '', os.path.basename(infile))
-    tmpdir_gatk = P.get_temp_dir('.')
     job_threads = PARAMS["gatk_threads"]
     library = PARAMS["readgroup_library"]
     platform = PARAMS["readgroup_platform"]
@@ -527,11 +526,24 @@ def haplotypeCaller(infile, outfile):
 ###############################################################################
 
 
-@merge(haplotypeCaller, "variants/all_samples.vcf")
-def genotypeGVCFs(infiles, outfile):
+@merge(haplotypeCaller, "variants/genomicsdb")
+def consolidateGVCFs(infiles, outfile):
+    '''generates a GenomicsDB workspace from all GVCF files, this is an
+    easy-access database of all samples developed by the Intel-Broad team.'''
+    inputlen = len(infiles)
+    inputfiles = " -V ".join(infiles)
+
+    DB_MEMORY = PARAMS["gatk_dbmem"]
+
+    exome.consolidateGVCFs(inputfiles, outfile, inputlen, DB_MEMORY)
+
+###############################################################################
+
+
+@merge(consolidateGVCFs, "variants/all_samples.vcf")
+def genotypeGVCFs(infile, outfile):
     '''Joint genotyping of all samples together'''
     genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
-    inputfiles = " --variant ".join(infiles)
     options = PARAMS["gatk_genotype_options"]
-    exome.genotypeGVCFs(inputfiles, outfile, genome, options)
+
+    exome.genotypeGVCFs(infile, outfile, genome, options)
 
 ###############################################################################
 ###############################################################################
diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml
index 64754db4f..174270b7b 100644
--- a/cgatpipelines/tools/pipeline_exome/pipeline.yml
+++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml
@@ -70,6 +70,11 @@ gatk:
 
   memory: 2G
 
+  # memory required for merging GVCF files
+  # PLEASE NOTE: this is the memory allocated to the GATK (Java) process
+  # 1.2x this memory will be requested from the cluster to provide sufficient memory for the database tool.
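+  # As a worked example (a sketch based on the defaults in tasks/exome.py,
+  # where consolidateGVCFs uses two reader threads): dbmem: 20G gives the
+  # GenomicsDBImport Java process -Xmx20G/-Xms20G, and the cluster request
+  # becomes 20 * 1.2 / 2 = 12.0G per thread, i.e. ~24G across both threads.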
+ dbmem: 20G + dbsnp: /path/to/dbsnp_138.hg19.vcf hapmap: /path/to/hapmap_3.3.hg19.sites.vcf From 52b67d0328e65eba86a82f03f1488a245575e21c Mon Sep 17 00:00:00 2001 From: jscaber Date: Fri, 6 May 2022 13:30:09 +0100 Subject: [PATCH 16/43] gatk4: GenotypeGVCF now from database --- cgatpipelines/tasks/exome.py | 14 +++++++++++--- cgatpipelines/tools/pipeline_exome.py | 9 ++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index fda9bfd2e..e33cedbdb 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -158,7 +158,9 @@ def consolidateGVCFs(inputfiles, outfile, inputlen, dbmem="20G"): ''' % locals() if inputlen > 50: statement += ''' --batch-size 50''' - statement += ''' ;rm -rf %(tmpdir)s ;''' % locals() + statement += ''' > %(outfile)s.log.tmp 2>&1; + rm -rf %(tmpdir)s ; + mv %(outfile)s.log.tmp %(outfile)s.log''' % locals() P.run(statement) os.unlink(tmpfile) @@ -171,12 +173,18 @@ def genotypeGVCFs(infile, outfile, genome, options, gatkmem="2G"): '''Joint genotyping of all samples together''' job_memory = gatkmem job_threads = 3 + infile = P.snip(infile, ".log") + tmpdir = P.get_temp_dir('.') + logfile = P.snip(outfile,".vcf")+".log" statement = '''gatk GenotypeGVCFs - -o %(outfile)s + -O %(outfile)s -R %(genome)s - --variant %(inputfiles)s''' % locals() + -V gendb://%(infile)s + --tmp-dir %(tmpdir)s + %(options)s > %(logfile)s 2>&1; + rm -rf %(tmpdir)s''' % locals() P.run(statement) ############################################################################## diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index bc715da61..2672d7109 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -526,13 +526,14 @@ def haplotypeCaller(infile, outfile): ############################################################################### -@merge(haplotypeCaller, "variants/genomicsdb") +@merge(haplotypeCaller, "variants/genomicsdb.log") def consolidateGVCFs(infiles, outfile): '''generates a GenomicsDB workspace from all GVCF files, this is an easy-access database of all samples developed by the Intel-Broad team.''' inputlen = len(infiles) inputfiles = " -V ".join(infiles) + DB_MEMORY = PARAMS["gatk_dbmem"] exome.consolidateGVCFs(inputfiles, outfile, inputlen, DB_MEMORY) @@ -540,12 +541,14 @@ def consolidateGVCFs(infiles, outfile): ############################################################################### -@merge(consolidateGVCFs, "variants/all_samples.vcf") +@transform(consolidateGVCFs, regex(r"variants/genomicsdb.log"), + "variants/all_samples.vcf") def genotypeGVCFs(infile, outfile): '''Joint genotyping of all samples together''' genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" options = PARAMS["gatk_genotype_options"] - exome.genotypeGVCFs(inputfiles, outfile, genome, options) + + exome.genotypeGVCFs(infile, outfile, genome, options) ############################################################################### ############################################################################### From 6be599f4d75d861e3014439ded38fd7c27831c96 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 9 May 2022 11:18:36 +0100 Subject: [PATCH 17/43] added options for rMATS output filtering (results table) - FDR, PSI and coverage --- cgatpipelines/tools/pipeline_splicing.py | 57 ++++++++++++++++++------ 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/cgatpipelines/tools/pipeline_splicing.py 
b/cgatpipelines/tools/pipeline_splicing.py index 1e9a9e0fb..2cb62fb87 100644 --- a/cgatpipelines/tools/pipeline_splicing.py +++ b/cgatpipelines/tools/pipeline_splicing.py @@ -789,6 +789,9 @@ def collateMATS(infiles, outfile): MATS_fdr : string :term:`PARAMS`. User specified threshold for result counting + MATS_psi: string + :term: `PARAMS`. User specified PSI threshold for result counting + outfile: string summary file containing number of results below FDR threshold ''' @@ -811,20 +814,44 @@ def collateMATS(infiles, outfile): for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]: temp = pd.read_csv("%s/%s.MATS.JC.txt" % (indir, event), sep='\t') - total.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) - up.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)]))) - down.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)]))) + # calculate mean coverage for particular junction for each variable + temp['SJC_SUM_1'] = [mean(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [mean(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [mean(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [mean(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = [['SJC_SUM_1','IJC_SUM_1']].max() + temp['2_MAX'] = [['SJC_SUM_2','IJC_SUM_2']].max() + + # three conditions to be met: + # 1. FDR thrshold + # 2. Fixed PSI threshold + # 3. minimum coverage in both conditions + total.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + up.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + down.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) temp = pd.read_csv("%s/%s.novelJunc.JC.tsv" % (indir, event), sep='\t') - total_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) - up_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] > 0.1)]))) - down_newJunc.append(int(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (temp['IncLevelDifference'] < -0.1)]))) + total_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + up_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + down_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) #experimental 
feature - deactivated #temp = pd.read_csv("%s/%s.novelSS.JC.tsv" % # (indir, event), sep='\t') @@ -949,8 +976,10 @@ def runPermuteMATS(infiles, outfile, design): for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]: temp = pd.read_csv("%s/%s.MATS.JC.txt" % (os.path.dirname(outfile), event), sep='\t') - collate.append(str(len(temp[(temp['FDR'] < - float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > 0.1)]))) + collate.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & + (temp['1_MAX'] >= PARAMS['MATS_coverage']) & + (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) with open(outfile, "w") as f: f.write("Group1\tGroup2\tSE\tA5SS\tA3SS\tMXE\tRI\n") f.write('\t'.join(collate)) From aaaf87fae749ebabefeef2f15721df5858fd33ef Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 9 May 2022 11:19:41 +0100 Subject: [PATCH 18/43] options for pipeline splicing - yml file update --- .../tools/pipeline_splicing/pipeline.yml | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/cgatpipelines/tools/pipeline_splicing/pipeline.yml b/cgatpipelines/tools/pipeline_splicing/pipeline.yml index 505e2995b..6bf7975fc 100644 --- a/cgatpipelines/tools/pipeline_splicing/pipeline.yml +++ b/cgatpipelines/tools/pipeline_splicing/pipeline.yml @@ -44,17 +44,28 @@ annotations: dir: '' - ################################################################ - # - # MATS options - # - ################################################################ +################################################################ +# +# MATS options +# +################################################################ MATS: # Cutoff splicing difference # The cutoff used in the null hypothesis test for differential splicing. # The default is 0.0001 for 0.01% difference. Valid: 0 <= cutoff < 1 cutoff: '' + # PSI threshold + # This allows for setting a fixed minimum PSI change (e.g. 0.1) for filtering + # results. This can be used with a more lenient cstat, to increase the number + # of results. 
The threshold is not hypothesis-tested
+  psi: ''
+
+  # Coverage threshold
+  # This allows for setting a fixed minimum number of reads that support this
+  # alternative splicing
+  coverage: ''
+
   # Strandednes
   # fr-firststrand, fr-secondstrand or fr-unstranded
   libtype: ''
@@ -71,11 +82,11 @@ MATS:
 
 
 
-  ################################################################
-  #
-  # DEXSeq options
-  #
-  ################################################################
+################################################################
+#
+# DEXSeq options
+#
+################################################################
 
 DEXSeq:
 
   # Strand specific library prep
From cb62f8cffc63e322dd56f4449cd3b8f8856af8db Mon Sep 17 00:00:00 2001
From: jscaber
Date: Mon, 9 May 2022 11:22:29 +0100
Subject: [PATCH 19/43] snpeff updated

---
 cgatpipelines/tools/pipeline_exome.py           |  1 -
 cgatpipelines/tools/pipeline_exome/pipeline.yml | 12 ++++++------
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py
index 2672d7109..442cb5021 100644
--- a/cgatpipelines/tools/pipeline_exome.py
+++ b/cgatpipelines/tools/pipeline_exome.py
@@ -675,7 +675,6 @@ def annotateVariantsSNPeff(infile, outfile):
     config = PARAMS["annotation_snpeff_config"]
     statement = (
         "snpEff -Xmx%(job_memory)s "
-        "eff "
        "-c %(config)s "
         "-v %(snpeff_genome)s "
         "-o gatk %(infile)s > %(outfile)s" % locals())
diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml
index 174270b7b..81bb7cd52 100644
--- a/cgatpipelines/tools/pipeline_exome/pipeline.yml
+++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml
@@ -75,9 +75,9 @@ gatk:
   # 1.2x this memory will be requested from the cluster to provide sufficient memory for the database tool.
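   # As a worked example (a sketch based on the defaults in tasks/exome.py,
   # where consolidateGVCFs uses two reader threads): dbmem: 20G gives the
   # GenomicsDBImport Java process -Xmx20G/-Xms20G, and the cluster request
   # becomes 20 * 1.2 / 2 = 12.0G per thread, i.e. ~24G across both threads.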
dbmem: 20G - dbsnp: /path/to/dbsnp_138.hg19.vcf + dbsnp: /path/to/dbsnp.vcf - hapmap: /path/to/hapmap_3.3.hg19.sites.vcf + hapmap: /path/to/hapmap.vcf omni: /path/to/omni/1000G_omni2.5.hg19.sites.vcf @@ -113,18 +113,18 @@ annotation: threads: 3 #Config file specifies that the genome must be downloaded into your home directory - snpeff_config: /ifs/apps/bio/snpEff-4.3/snpEff.config + snpeff_config: path/to/snpEff-VERSION/snpEff.config - snpeff_genome: hg19 + snpeff_genome: hg38 # path to dbNSFP database - dbnsfp: /ifs/mirror/snpsift/data_2016/dbNFSP.txt.gz + dbnsfp: /path/to/dbNFSP.txt.gz #dbNSFP columns to include dbnsfpannotators: [SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred] # path to clinvar vcf - clinvar: /ifs/mirror/clinvar/clinvar_20160531.vcf.gz + clinvar: /path/to/clinvar_DATE.vcf.gz # path to ExAC vcf exac: /ifs/mirror/ExAC/sites/ExAC.r0.3.1.sites.vep.vcf.gz From 75b3ef409af24a646d7b3a2ded3d8015c18c72df Mon Sep 17 00:00:00 2001 From: jscaber Date: Tue, 10 May 2022 08:22:38 +0100 Subject: [PATCH 20/43] bug corrections pipeline splicing --- cgatpipelines/tools/pipeline_splicing.py | 28 ++++++++++++++++++------ 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/cgatpipelines/tools/pipeline_splicing.py b/cgatpipelines/tools/pipeline_splicing.py index 2cb62fb87..4966bd7ce 100644 --- a/cgatpipelines/tools/pipeline_splicing.py +++ b/cgatpipelines/tools/pipeline_splicing.py @@ -815,13 +815,12 @@ def collateMATS(infiles, outfile): temp = pd.read_csv("%s/%s.MATS.JC.txt" % (indir, event), sep='\t') # calculate mean coverage for particular junction for each variable - temp['SJC_SUM_1'] = [mean(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] - temp['IJC_SUM_1'] = [mean(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] - temp['SJC_SUM_2'] = [mean(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] - temp['SJC_SUM_2'] = [mean(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] - temp['1_MAX'] = [['SJC_SUM_1','IJC_SUM_1']].max() - temp['2_MAX'] = [['SJC_SUM_2','IJC_SUM_2']].max() - + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) # three conditions to be met: # 1. FDR thrshold # 2. 
Fixed PSI threshold @@ -838,8 +837,15 @@ def collateMATS(infiles, outfile): (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & (temp['1_MAX'] >= PARAMS['MATS_coverage']) & (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + temp = pd.read_csv("%s/%s.novelJunc.JC.tsv" % (indir, event), sep='\t') + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) total_newJunc.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & (temp['1_MAX'] >= PARAMS['MATS_coverage']) & @@ -852,6 +858,7 @@ def collateMATS(infiles, outfile): (temp['IncLevelDifference'] < -PARAMS['MATS_psi']) & (temp['1_MAX'] >= PARAMS['MATS_coverage']) & (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) + #experimental feature - deactivated #temp = pd.read_csv("%s/%s.novelSS.JC.tsv" % # (indir, event), sep='\t') @@ -976,6 +983,13 @@ def runPermuteMATS(infiles, outfile, design): for event in ["SE", "A5SS", "A3SS", "MXE", "RI"]: temp = pd.read_csv("%s/%s.MATS.JC.txt" % (os.path.dirname(outfile), event), sep='\t') + + temp['SJC_SUM_1'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_1'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_1.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['SJC_SUM_2'] = [sum(x)/len(x) for x in temp.SJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] + temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) + temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) collate.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & (temp['1_MAX'] >= PARAMS['MATS_coverage']) & From e406b90e6011b007fcddc4b078915f6240a72751 Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 11 May 2022 00:01:26 +0100 Subject: [PATCH 21/43] further bugfixes splicing pipeline --- cgatpipelines/tools/pipeline_splicing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgatpipelines/tools/pipeline_splicing.py b/cgatpipelines/tools/pipeline_splicing.py index 4966bd7ce..9bee80f59 100644 --- a/cgatpipelines/tools/pipeline_splicing.py +++ b/cgatpipelines/tools/pipeline_splicing.py @@ -990,7 +990,7 @@ def runPermuteMATS(infiles, outfile, design): temp['IJC_SUM_2'] = [sum(x)/len(x) for x in temp.IJC_SAMPLE_2.str.split(',').apply(lambda x: [float(i) for i in x])] temp['1_MAX'] = temp[['SJC_SUM_1','IJC_SUM_1']].max(axis=1) temp['2_MAX'] = temp[['SJC_SUM_2','IJC_SUM_2']].max(axis=1) - collate.append(int(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & + collate.append(str(len(temp[(temp['FDR'] < float(PARAMS['MATS_fdr'])) & (abs(temp['IncLevelDifference']) > PARAMS['MATS_psi']) & (temp['1_MAX'] >= PARAMS['MATS_coverage']) & (temp['2_MAX'] >= PARAMS['MATS_coverage'])]))) From 37ed54b86d5a790cd2b4987cf5cd12fd51333624 Mon Sep 17 00:00:00 2001 From: jscaber 
Date: Wed, 11 May 2022 00:06:52 +0100 Subject: [PATCH 22/43] major changes exome pipeline - rearrangement of pipeline - completed gatk best practices - started variant annotation updates --- cgatpipelines/tasks/exome.py | 87 +++-- cgatpipelines/tools/pipeline_exome.py | 322 ++++++++---------- .../tools/pipeline_exome/pipeline.yml | 34 +- 3 files changed, 197 insertions(+), 246 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index e33cedbdb..6789de9cd 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -262,30 +262,21 @@ def variantAnnotator(vcffile, bamlist, outfile, genome, job_memory = gatkmem job_threads = 3 - if "--useAllAnnotations" in annotations: - anno = "--useAllAnnotations" - elif annotations: - anno = " -A " + " -A ".join(annotations) - else: - anno = "" - statement = '''gatk VariantAnnotator -R %(genome)s -I %(bamlist)s - -A SnpEff - --snpEffFile %(snpeff_file)s - -o %(outfile)s - --variant %(vcffile)s - -L %(vcffile)s + -- + -V %(vcffile)s --dbsnp %(dbsnp)s - %(anno)s''' % locals() + -O %(outfile)s''' % locals() P.run(statement) ############################################################################## def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, - kgenomes=None, hapmap=None, omni=None, mills=None, gatkmem = "2G"): + kgenomes=None, hapmap=None, omni=None, mills=None, + axiom=None, gatkmem = "2G"): '''Create variant recalibration file''' job_memory = gatkmem job_threads = 3 @@ -294,50 +285,55 @@ def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, if mode == 'SNP': statement = '''gatk VariantRecalibrator -R %(genome)s - -input %(infile)s - -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s - -resource:omni,known=false,training=true,truth=true,prior=12.0 %(omni)s - -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 %(dbsnp)s - -resource:1000G,known=false,training=true,truth=false,prior=10.0 %(kgenomes)s - -an QD -an SOR -an MQRankSum - -an ReadPosRankSum -an FS -an MQ - --maxGaussians 4 + -V %(infile)s + --trust-all-polymorphic + -tranche 100.0 -tranche 99.95 -tranche 99.9 -tranche 99.8 -tranche 99.6 -tranche 99.5 -tranche 99.4 -tranche 99.3 -tranche 99.0 -tranche 98.0 -tranche 97.0 -tranche 90.0 + -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ -an SOR -an InbreedingCoeff -mode %(mode)s - -recalFile %(outfile)s - -tranchesFile %(track)s.tranches - -rscriptFile %(track)s.plots.R ''' % locals() + --max-gaussians 6 + --resource:hapmap,known=false,training=true,truth=true,prior=15 %(hapmap)s + --resource:omni,known=false,training=true,truth=true,prior=12 %(omni)s + --resource:1000G,known=false,training=true,truth=false,prior=10 %(kgenomes)s + --resource:dbsnp,known=true,training=false,truth=false,prior=7 %(dbsnp)s + -O %(track)s.recal + --tranches-file %(track)s.tranches + --rscript-file %(track)s.plots.R + > %(track)s.log 2>&1 ''' % locals() P.run(statement) elif mode == 'INDEL': statement = '''gatk VariantRecalibrator -R %(genome)s - -input %(infile)s - -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s - -an QD -an MQRankSum - -an ReadPosRankSum -an FS -an MQ - --maxGaussians 4 - --minNumBadVariants 5000 + -V %(infile)s + --resource:mills,known=true,training=true,truth=true,prior=12 %(mills)s + --resource:axiomPoly,known=false,training=true,truth=false,prior=10 %(axiom)s + --resource:dbsnp,known=true,training=false,truth=false,prior=2 %(dbsnp)s + -an QD -an MQRankSum -an ReadPosRankSum -an FS -an MQ + 
--max-gaussians 4
                     -mode %(mode)s
-                    -recalFile %(outfile)s
-                    -tranchesFile %(track)s.tranches
-                    -rscriptFile %(track)s.plots.R ''' % locals()
+                    -O %(track)s.recal
+                    --tranches-file %(track)s.tranches
+                    --rscript-file %(track)s.plots.R
+                    > %(track)s.log 2>&1 ''' % locals()
         P.run(statement)
 
 ##############################################################################
 
 
-def applyVariantRecalibration(vcf, recal, tranches, outfile, genome, mode, gatkmem):
+def applyVQSR(vcf, recal, tranches, outfile, genome, mode, gatkmem):
     '''Perform variant quality score recalibration using GATK '''
     job_memory = gatkmem
     job_threads = 3
 
-    statement = '''gatk ApplyRecalibration
+    statement = '''gatk ApplyVQSR
     -R %(genome)s
-    -input:VCF %(vcf)s
-    -recalFile %(recal)s
-    -tranchesFile %(tranches)s
-    --ts_filter_level 99.0
+    -V %(vcf)s
+    --recal-file %(recal)s
+    --tranches-file %(tranches)s
+    --truth-sensitivity-filter-level 99.7
+    --create-output-variant-index true
     -mode %(mode)s
-    -o %(outfile)s ''' % locals()
+    -O %(outfile)s
+    > %(outfile)s.log 2>&1 ''' % locals()
     P.run(statement)
 
 ##############################################################################
@@ -351,17 +347,16 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem):
     statement = '''gatk VariantsToTable
     -R %(genome)s
     -V %(infile)s
-    --showFiltered
-    --allowMissingData
+    --show-filtered
     %(columns)s
-    -o %(outfile)s''' % locals()
+    -O %(outfile)s''' % locals()
     P.run(statement)
 
 ##############################################################################
 
 
 def selectVariants(infile, outfile, genome, select):
-    '''Filter de novo variants based on provided jexl expression'''
+    '''Filter de novo variants based on provided expression'''
     statement = '''gatk SelectVariants
     -R %(genome)s
     --variant %(infile)s
@@ -1287,3 +1282,5 @@ def CleanVariantTables(genes, variants, cols, outfile):
     df.columns = ['gene', 'CHROM', 'POS']
     variants = vp1.merge(df, 'left')
     variants.to_csv(outfile, sep="\t")
+
+
diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py
index 442cb5021..04abade57 100644
--- a/cgatpipelines/tools/pipeline_exome.py
+++ b/cgatpipelines/tools/pipeline_exome.py
@@ -208,7 +208,6 @@
 # The following functions are designed to upload meta-data to the csvdb
 # These haven't been fully implemented yet
 
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["roi_bed"], "roi.load")
 def loadROI(infile, outfile):
@@ -224,9 +223,6 @@ def loadROI(infile, outfile):
     > %(outfile)s '''
     P.run(statement)
 
-###############################################################################
-
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["roi_to_gene"], "roi2gene.load")
 def loadROI2Gene(infile, outfile):
@@ -240,9 +236,6 @@ def loadROI2Gene(infile, outfile):
     > %(outfile)s '''
     P.run(statement)
 
-###############################################################################
-
-
 @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db")
 @files(PARAMS["samples"], "samples.load")
 def loadSamples(infile, outfile):
@@ -267,11 +260,15 @@ def createSequenceDictionary(outfile):
         -R %(genome_file)s
         -O %(outfile)s'''
     P.run(statement)
+
+
 ###############################################################################
 ###############################################################################
 ###############################################################################
-# Alignment to a reference genome
+# Pre-Processing for Variant Discovery
+###############################################################################
+# 
Alignment & post-alignment QC & GATK preparation @follows(mkdir("bam")) @transform(INPUT_FORMATS, REGEX_FORMATS, r"bam/\1.bam") @@ -283,11 +280,6 @@ def mapReads(infiles, outfile): statement = m.build((infiles,), outfile) P.run(statement, job_memory="8G", job_threads=PARAMS["bwa_threads"]) -############################################################################### -############################################################################### -############################################################################### -# Post-alignment QC - @transform(mapReads, regex(r"bam/(\S+).bam"), r"bam/\1.picard_stats") def PicardAlignStats(infile, outfile): @@ -295,8 +287,6 @@ def PicardAlignStats(infile, outfile): genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" bamstats.buildPicardAlignmentStats(infile, outfile, genome, PICARD_MEMORY) -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(PicardAlignStats, "picard_stats.load") @@ -304,16 +294,8 @@ def loadPicardAlignStats(infiles, outfile): '''Merge Picard alignment stats into single table and load into SQLite.''' bamstats.loadPicardAlignmentStats(infiles, outfile) -############################################################################### -############################################################################### -############################################################################### -# GATK - @follows(loadPicardAlignStats, mkdir("gatk")) - - - @transform(mapReads, regex(r"bam/(\S+).bam"), add_inputs(createSequenceDictionary), r"gatk/\1.readgroups.bam") @@ -333,12 +315,10 @@ def GATKReadGroups(infiles, outfile): GATK_MEMORY) iotools.zap_file(infile) -############################################################################### -############################################################################### + ############################################################################### # Remove duplicates, realign and recalibrate lane-by-lane - @transform(GATKReadGroups, regex(r"gatk/(\S+).readgroups.bam"), r"gatk/\1.dedup.bam") @@ -347,8 +327,6 @@ def RemoveDuplicatesLane(infile, outfile): bamstats.buildPicardDuplicateStats(infile, outfile, PICARD_MEMORY) iotools.zap_file(infile) -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(RemoveDuplicatesLane, "picard_duplicate_stats_lane.load") @@ -356,8 +334,6 @@ def loadPicardDuplicateStatsLane(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' bamstats.loadPicardDuplicateStats(infiles, outfile) -############################################################################### - @transform(RemoveDuplicatesLane, regex(r"gatk/(\S+).dedup.bam"), @@ -380,12 +356,10 @@ def GATKBaseRecal(infile, outfile): GATK_MEMORY) iotools.zap_file(infile) -############################################################################### -############################################################################### + ############################################################################### # Merge BAMs across different lanes for the same sample - @collate(GATKBaseRecal, regex(r"gatk/(.*).bqsr.bam"), r"gatk/\1.merged.bam") @@ -406,12 +380,10 @@ def mergeBAMs(infiles, outfile): for inputfile in infiles: iotools.zap_file(inputfile) -############################################################################### 
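+# A sketch of the joint-calling flow implemented by the tasks below (summary
+# comment only; file names are simplified from the actual task outputs):
+#   per sample: gatk HaplotypeCaller -ERC GVCF        -> variants/<track>.g.vcf
+#   cohort:     gatk GenomicsDBImport                 -> variants/genomicsdb
+#               gatk GenotypeGVCFs gendb://genomicsdb -> variants/all_samples.vcf
+#               VariantFiltration (ExcessHet) + MakeSitesOnlyVcf
+#               VariantRecalibrator + ApplyVQSR (SNPs first, then indels)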
-############################################################################### + ############################################################################### # Remove duplicates sample-by-sample - @transform(mergeBAMs, regex(r"gatk/(\S+).merged.bam"), add_inputs(r"gatk/\1.merged.bam.count"), @@ -428,17 +400,13 @@ def RemoveDuplicatesSample(infiles, outfile): iotools.zap_file(infile) -############################################################################### - - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(RemoveDuplicatesSample, "picard_duplicate_stats_sample.load") def loadPicardDuplicateStatsSample(infiles, outfile): '''Merge Picard duplicate stats into single table and load into SQLite.''' bamstats.loadPicardDuplicateStats(infiles, outfile) -############################################################################### -############################################################################### + ############################################################################### # Coverage of targetted area @@ -454,8 +422,6 @@ def buildCoverageStats(infile, outfile): bamstats.buildPicardCoverageStats(infile, outfile, baits, regions, PICARD_MEMORY) -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @merge(buildCoverageStats, "coverage_stats.load") @@ -463,12 +429,10 @@ def loadCoverageStats(infiles, outfile): '''Import coverage statistics into SQLite''' bamstats.loadPicardCoverageStats(infiles, outfile) -############################################################################### -############################################################################### + ############################################################################### # Guess sex - @follows(mkdir("xy_ratio")) @transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"), @@ -478,8 +442,6 @@ def calcXYratio(infile, outfile): per megabase of sequence on X and Y''' exome.guessSex(infile, outfile) -############################################################################### - @merge(calcXYratio, "xy_ratio/xy_ratio.tsv") def mergeXYRatio(infiles, outfile): @@ -493,8 +455,6 @@ def mergeXYRatio(infiles, outfile): > %(outfile)s''' P.run(statement) -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @transform(mergeXYRatio, regex(r"xy_ratio/xy_ratio.tsv"), @@ -503,10 +463,12 @@ def loadXYRatio(infile, outfile): '''load into database''' P.load(infile, outfile, "--header-names=Track,X,Y,XY_ratio") + + ############################################################################### ############################################################################### ############################################################################### -# Variant Calling +# GATK germline cohort calling @follows(mkdir("variants")) @@ -523,8 +485,6 @@ def haplotypeCaller(infile, outfile): exome.haplotypeCaller(infile, outfile, genome, dbsnp, intervals, padding, options, GATK_MEMORY) -############################################################################### - @merge(haplotypeCaller, "variants/genomicsdb.log") def consolidateGVCFs(infiles, outfile): @@ -538,8 +498,6 @@ def consolidateGVCFs(infiles, outfile): exome.consolidateGVCFs(inputfiles, outfile, inputlen, DB_MEMORY) -############################################################################### - @transform(consolidateGVCFs, regex(r"variants/genomicsdb.log"), "variants/all_samples.vcf") @@ 
-550,6 +508,112 @@ def genotypeGVCFs(infile, outfile): exome.genotypeGVCFs(infile, outfile, genome, options) + +@transform(genotypeGVCFs, + regex(r"variants/all_samples.vcf"), + r"variants/all_samples_excesshet.vcf") +def filterExcessHets(infile, outfile): + ''' + Hard filter on Excess Heterozygotes + Should not make a difference on small cohorts unless related + 54.69 corresponds to a Z score of -4.5 + ''' + job_memory = GATK_MEMORY + statement = ''' + gatk --java-options "-Xmx%(job_memory)s -Xms%(job_memory)s" + VariantFiltration + -V %(infile)s + --filter-expression "ExcessHet > 54.69" + --filter-name ExcessHet + -O %(outfile)s + > %(outfile)s.log 2>&1 + ''' + P.run(statement) + + +@transform(filterExcessHets, + regex(r"variants/all_samples_excesshet.vcf"), + r"variants/all_samples_sitesonly.vcf") +def sitesOnlyVcf(infile,outfile): + ''' + Create sites-only VCF with MakeSitesOnlyVcf + ''' + job_memory = GATK_MEMORY + statement = '''gatk MakeSitesOnlyVcf + -I %(infile)s + -O %(outfile)s + > %(outfile)s.log 2>&1 ''' + P.run(statement) + + + +############################################################################### +# SNP Recalibration + +@transform(sitesOnlyVcf, + regex(r"variants/all_samples_sitesonly.vcf"), + r"variants/all_samples.snp_vqsr.recal") +def variantRecalibratorSnps(infile, outfile): + '''Create variant recalibration file''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + dbsnp = PARAMS["gatk_dbsnp"] + job_threads = PARAMS["gatk_threads"] + kgenomes = PARAMS["gatk_kgenomes"] + hapmap = PARAMS["gatk_hapmap"] + omni = PARAMS["gatk_omni"] + mode = 'SNP' + exome.variantRecalibrator(infile, outfile, genome, mode, dbsnp, + kgenomes, hapmap, omni, GATK_MEMORY) + + +@follows(variantRecalibratorSnps) +@transform(filterExcessHets, + regex(r"variants/all_samples_excesshet.vcf"), + add_inputs(r"variants/all_samples.snp_vqsr.recal", + r"variants/all_samples.snp_vqsr.tranches"), + r"variants/all_samples.snp_vqsr.vcf") +def applyVQSRSnps(infiles, outfile): + '''Perform variant quality score recalibration using GATK ''' + vcf, recal, tranches = infiles + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + mode = 'SNP' + exome.applyVQSR(vcf, recal, tranches, + outfile, genome, mode, GATK_MEMORY) + +############################################################################### +# Indel recalibration + + +@transform(sitesOnlyVcf, + regex(r"variants/all_samples_sitesonly.vcf"), + r"variants/all_samples.indel_vqsr.recal") +def variantRecalibratorIndels(infile, outfile): + '''Create variant recalibration file''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + job_threads = PARAMS["gatk_threads"] + mills = PARAMS["gatk_mills"] + dbsnp = PARAMS["gatk_dbsnp"] + axiom = PARAMS["gatk_axiom"] + mode = 'INDEL' + exome.variantRecalibrator(infile, outfile, genome, mode, + mills=mills, axiom=axiom, dbsnp=dbsnp, gatkmem=GATK_MEMORY) + + +@follows(variantRecalibratorIndels) +@transform(applyVQSRSnps, + regex(r"variants/all_samples.snp_vqsr.vcf"), + add_inputs(r"variants/all_samples.indel_vqsr.recal", + r"variants/all_samples.indel_vqsr.tranches"), + r"variants/all_samples.vqsr.vcf") +def applyVQSRIndels(infiles, outfile): + '''Perform variant quality score recalibration using GATK ''' + vcf, recal, tranches = infiles + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + mode = 'INDEL' + exome.applyVQSR(vcf, recal, tranches, + outfile, genome, mode, GATK_MEMORY) + + 
############################################################################### ############################################################################### ############################################################################### @@ -567,8 +631,6 @@ def SelectExonicHapmapVariants(infile, outfile): > %(outfile)s''' % locals() P.run(statement) -############################################################################### - @follows(SelectExonicHapmapVariants) @transform(RemoveDuplicatesSample, @@ -585,8 +647,6 @@ def HapMapGenotype(infiles, outfile): exome.haplotypeCaller(infile, outfile, genome, dbsnp, intervals, padding, options) -############################################################################### - @transform(HapMapGenotype, regex(r"hapmap/(\S+).hapmap.vcf"), r"hapmap/\1.hapmap.vcf.gz") @@ -596,12 +656,10 @@ def indexVCFs(infile, outfile): tabix -p vcf %(outfile)s; ''' P.run(statement) -############################################################################### -############################################################################### + ############################################################################### # Compare Hapmap genotypes to assess relatedness - @follows(indexVCFs, mkdir("hapmap/vcfcompare")) @permutations("hapmap/*.hapmap.vcf.gz", formatter("hapmap/(\S+).hapmap.vcf.gz$"), @@ -616,8 +674,6 @@ def vcfCompare(infiles, outfile): %(sample1)s %(sample2)s > %(outfile)s''' P.run(statement) -############################################################################### - @transform(vcfCompare, regex(r"hapmap/vcfcompare/(\S+).vcfcompare"), @@ -630,8 +686,6 @@ def parseVcfCompare(infile, outfile): > %(outfile)s''' P.run(statement) -############################################################################### - @merge(parseVcfCompare, "hapmap/vcfcompare/ndr.tsv") def mergeNDR(infiles, outfile): @@ -648,8 +702,6 @@ def mergeNDR(infiles, outfile): inf.close() outf.close() -############################################################################### - @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @transform(mergeNDR, regex(r"hapmap/vcfcompare/ndr.tsv"), @@ -658,14 +710,17 @@ def loadNDR(infile, outfile): '''Load NDR into database''' P.load(infile, outfile) + ############################################################################### ############################################################################### ############################################################################### # Variant Annotation +############################################################################### +# SnpSift -@transform(genotypeGVCFs, - regex(r"variants/all_samples.vcf"), +@transform(applyVQSRIndels, + regex(r"variants/all_samples.vqsr.vcf"), r"variants/all_samples.snpeff.vcf") def annotateVariantsSNPeff(infile, outfile): '''Annotate variants using SNPeff''' @@ -677,11 +732,9 @@ def annotateVariantsSNPeff(infile, outfile): "snpEff -Xmx%(job_memory)s " "-c %(config)s " "-v %(snpeff_genome)s " - "-o gatk %(infile)s > %(outfile)s" % locals()) + "%(infile)s > %(outfile)s 2> %(outfile)s.log" % locals()) P.run(statement, job_memory=PARAMS["annotation_memory"]) -############################################################################### - @transform(annotateVariantsSNPeff, regex(r"variants/all_samples.snpeff.vcf"), @@ -690,7 +743,7 @@ def vcfToTableSnpEff(infile, outfile): '''Converts vcf to tab-delimited file''' genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["annotation_snpeff_to_table"] - 
exome.vcfToTable(infile, outfile, genome, columns) + exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -700,115 +753,12 @@ def loadTableSnpEff(infile, outfile): '''Load VCF annotations into database''' P.load(infile, outfile, options="--retry --ignore-empty") - -############################################################################### -# GATK Variant Annotator - - -@merge(RemoveDuplicatesSample, "gatk/all_samples.list") -def listOfBAMs(infiles, outfile): - '''generates a file containing a list of BAMs for use in VQSR''' - with iotools.open_file(outfile, "w") as outf: - for infile in infiles: - outf.write(infile + '\n') - -############################################################################### - - -@follows(annotateVariantsSNPeff) -@transform(genotypeGVCFs, - regex(r"variants/all_samples.vcf"), - add_inputs(listOfBAMs, r"variants/all_samples.snpeff.vcf"), - r"variants/all_samples.annotated.vcf") -def variantAnnotator(infiles, outfile): - '''Annotate variant file using GATK VariantAnnotator''' - vcffile, bamlist, snpeff_file = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - dbsnp = PARAMS["gatk_dbsnp"] - annotations = PARAMS["gatk_variant_annotations"] - exome.variantAnnotator(vcffile, bamlist, outfile, genome, - dbsnp, annotations, snpeff_file) - - -############################################################################### -# SNP Recalibration - - -@transform(variantAnnotator, - regex(r"variants/all_samples.annotated.vcf"), - r"variants/all_samples.snp_vqsr.recal") -def variantRecalibratorSnps(infile, outfile): - '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - dbsnp = PARAMS["gatk_dbsnp"] - job_threads = PARAMS["gatk_threads"] - track = P.snip(outfile, ".recal") - kgenomes = PARAMS["gatk_kgenomes"] - hapmap = PARAMS["gatk_hapmap"] - omni = PARAMS["gatk_omni"] - mode = 'SNP' - exome.variantRecalibrator(infile, outfile, genome, mode, dbsnp, - kgenomes, hapmap, omni, GATK_MEMORY) - -############################################################################### - - -@follows(variantRecalibratorSnps) -@transform(variantAnnotator, - regex(r"variants/all_samples.annotated.vcf"), - add_inputs(r"variants/all_samples.snp_vqsr.recal", - r"variants/all_samples.snp_vqsr.tranches"), - r"variants/all_samples.snp_vqsr.vcf") -def applyVariantRecalibrationSnps(infiles, outfile): - '''Perform variant quality score recalibration using GATK ''' - vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - mode = 'SNP' - exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode, GATK_MEMORY) - -############################################################################### -# Indel recalibration - - -@transform(applyVariantRecalibrationSnps, - regex(r"variants/all_samples.snp_vqsr.vcf"), - r"variants/all_samples.vqsr.recal") -def variantRecalibratorIndels(infile, outfile): - '''Create variant recalibration file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - job_threads = PARAMS["gatk_threads"] - track = P.snip(outfile, ".recal") - mills = PARAMS["gatk_mills"] - dbsnp = PARAMS["gatk_dbsnp"] - mode = 'INDEL' - exome.variantRecalibrator(infile, outfile, genome, mode, - mills=mills, gatkmem=GATK_MEMORY) - -############################################################################### - - -@follows(variantRecalibratorIndels) -@transform(applyVariantRecalibrationSnps, 
- regex(r"variants/all_samples.snp_vqsr.vcf"), - add_inputs(r"variants/all_samples.vqsr.recal", - r"variants/all_samples.vqsr.tranches"), - r"variants/all_samples.vqsr.vcf") -def applyVariantRecalibrationIndels(infiles, outfile): - '''Perform variant quality score recalibration using GATK ''' - vcf, recal, tranches = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - mode = 'INDEL' - exome.applyVariantRecalibration(vcf, recal, tranches, - outfile, genome, mode, GATK_MEMORY) - - ############################################################################### # SnpSift -@transform(applyVariantRecalibrationIndels, - regex(r"variants/all_samples.vqsr.vcf"), - r"variants/all_samples_dbnsfp.snpsift.vcf") +@transform(annotateVariantsSNPeff, + regex(r"variants/all_samples.snpeff.vcf"), + r"variants/all_samples.snpsift.vcf") def annotateVariantsDBNSFP(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" @@ -820,7 +770,7 @@ def annotateVariantsDBNSFP(infile, outfile): else: annostring = "-f %s" % dbN_annotators - statement = """SnpSift.sh dbnsfp -db %(dbNSFP)s -v %(infile)s + statement = """SnpSift.sh dbnsxfp -db %(dbNSFP)s -v %(infile)s %(annostring)s > %(outfile)s;""" P.run(statement) @@ -1432,7 +1382,7 @@ def vcfToTable(infile, outfile): '''Converts vcf to tab-delimited file''' genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) + exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -1569,7 +1519,7 @@ def tabulateDeNovos(infile, outfile): '''Tabulate de novo variants''' genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) + exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -1932,6 +1882,12 @@ def loadCompoundHets(infile, outfile): ############################################################################### # coverage over candidate genes +@merge(RemoveDuplicatesSample, "gatk/all_samples.list") +def listOfBAMs(infiles, outfile): + '''generates a file containing a list of BAMs for use in VQSR''' + with iotools.open_file(outfile, "w") as outf: + for infile in infiles: + outf.write(infile + '\n') @active_if(PARAMS["coverage_calculate"] == 1) @transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary") diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 81bb7cd52..023634966 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -91,8 +91,6 @@ gatk: genotype_options: '' - variant_annotations: [AlleleBalance,AlleleBalanceBySample,AlleleCountBySample,BaseCounts,BaseQualityRankSumTest,ChromosomeCounts,ClippingRankSumTest,Coverage,DepthPerAlleleBySample,FisherStrand,GCContent,GenotypeSummaries,HardyWeinberg,HomopolymerRun,LikelihoodRankSumTest,LowMQ,MappingQualityRankSumTest,MappingQualityZero,MappingQualityZeroBySample,NBaseCount,QualByDepth,RMSMappingQuality,ReadPosRankSumTest,SampleList,SpanningDeletions,StrandBiasBySample,StrandOddsRatio,TandemRepeatAnnotator,VariantType] - vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F 
ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP hapmap: @@ -100,7 +98,7 @@ hapmap: loc: /path/to/hapmap/genotype_frequencies # bgzip compressede and tabix indexed VCF file for Hapmap - vcf: /ifs/mirror/genomes/gatk/hapmap_3.3.hg19.sites.vcf + vcf: /path/to/hapmap_3.3.hg19.sites.vcf padding: 0 @@ -127,19 +125,19 @@ annotation: clinvar: /path/to/clinvar_DATE.vcf.gz # path to ExAC vcf - exac: /ifs/mirror/ExAC/sites/ExAC.r0.3.1.sites.vep.vcf.gz + exac: /path/to/ExAC.r0.3.1.sites.vep.vcf.gz # path to dbsnp vcf - dbsnp: /ifs/mirror/dbsnp/All_20160527.vcf.gz + dbsnp: /path/to/All_20160527.vcf.gz # path to chromosome by chromosome 1000g vcfs - tgdir: /ifs/mirror/1000_genomes/phase3 + tgdir: /path/to/1000_genomes/phase3 # path to gwas catalog files - gwas_catalog: /ifs/mirror/gwascatalog/gwas_catalog_v1.0-downloaded_2015-07-14.txt + gwas_catalog: /path/to/gwas_catalog_v1.0-downloaded_2015-07-14.txt # path to "phastcons100way" files - phastcons: /ifs/mirror/phastcons + phastcons: /path/to/phastcons snpeff_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F EFF @@ -157,10 +155,10 @@ annotation: vepannotators: --sift b --variant_class --regulatory --domains # path to VEP scripts - veppath: /ifs/apps/bio/ensembl-vep-80/variant_effect_predictor + veppath: /path/to/ensembl-vep-80/variant_effect_predictor # location of cached annotations - vepcache: /ifs/mirror/VEP + vepcache: /path/to/VEP # species name vepspecies: homo_sapiens @@ -215,10 +213,10 @@ filtering: king: #path to king - path: /ifs/apps/bio/king + path: /path/to/king #path to plink - plink: /ifs/apps/bio/plink-1.07/plink-1.07-x86_64 + plink: /path/to/plink-1.07/plink-1.07-x86_64 coverage: @@ -226,7 +224,7 @@ coverage: calculate: 0 # supply gtf - all_exons: /ifs/mirror/annotations/hg19_ensembl75/geneset_coding_exons.gtf.gz + all_exons: /path/to/hg19_ensembl75/geneset_coding_exons.gtf.gz # comma-separated list of canonical ensembl identifiers for candidate genes of interest candidates: '' @@ -242,16 +240,16 @@ roi: bed: '' # Regions of interest - roi: /ifs/projects/proj048/agilent/S04380110_Regions.bed + roi: /path/to/agilent/S04380110_Regions.bed - intervals: /ifs/projects/proj048/agilent/S04380110_Regions.interval_list + intervals: /path/to/agilent/S04380110_Regions.interval_list # File mapping regions of interest to genes - to_gene: /ifs/projects/proj048/agilent/regions_to_genes.txt + to_gene: /path/to/agilent/regions_to_genes.txt - baits: /ifs/projects/proj048/agilent/S04380110_Probes_5col_samheader_ucsc.bed + baits: /path/to/agilent/S04380110_Probes_5col_samheader_ucsc.bed - regions: /ifs/projects/proj048/agilent/S04380110_Regions_5col_samheader_ucsc.bed + regions: /path/to/agilent/S04380110_Regions_5col_samheader_ucsc.bed padding: 100 From 0834b144d42d883a1fc65a10d76cbe42049b3e90 Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 11 May 2022 
15:39:18 +0100 Subject: [PATCH 23/43] replace hapmap by somelier --- cgatpipelines/tasks/exome.py | 20 +- cgatpipelines/tools/pipeline_exome.py | 171 +++++++----------- .../tools/pipeline_exome/pipeline.yml | 4 +- 3 files changed, 69 insertions(+), 126 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 6789de9cd..538d1bdf5 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -256,24 +256,6 @@ def strelkaINDELCaller(infile_control, infile_tumor, outfile, genome, config, ############################################################################## -def variantAnnotator(vcffile, bamlist, outfile, genome, - dbsnp, annotations="", snpeff_file="", gatkmem="2G"): - '''Annotate variant file using GATK VariantAnnotator''' - job_memory = gatkmem - job_threads = 3 - - statement = '''gatk VariantAnnotator - -R %(genome)s - -I %(bamlist)s - -- - -V %(vcffile)s - --dbsnp %(dbsnp)s - -O %(outfile)s''' % locals() - P.run(statement) - -############################################################################## - - def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, kgenomes=None, hapmap=None, omni=None, mills=None, axiom=None, gatkmem = "2G"): @@ -319,7 +301,7 @@ def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None, ############################################################################## -def applyVSQR(vcf, recal, tranches, outfile, genome, mode, gatkmem): +def applyVQSR(vcf, recal, tranches, outfile, genome, mode, gatkmem): '''Perform variant quality score recalibration using GATK ''' job_memory = gatkmem job_threads = 3 diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 04abade57..abd7a93da 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -18,8 +18,7 @@ 3. Local realignment and BQSR in GATK and deduplication in Picard 4. Calculate the ratio of reads on the X and Y chromosomes to assert sex 5. Variant calling in families (SNVs & indels) using GATK HaplotypeCaller - 6. HapMap genotyping in individuals using GATK HaplotypeCaller - 7. Comparison of Hapmap genotypes to assess relatedness (VCFtools) + 7. Comparison of join VCF genotypes to assess relatedness (somalier) 8. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift 9. Variant quality score recalibration (GATK) 10. Flags variants within genes of interest (such as known disease genes) @@ -152,6 +151,7 @@ * samtools >= 1.1 * GATK >= 2.7 * snpEff >= 4.0 +* somalier * Gemini >= ? 
* VCFtools >= 0.1.8a @@ -511,7 +511,7 @@ def genotypeGVCFs(infile, outfile): @transform(genotypeGVCFs, regex(r"variants/all_samples.vcf"), - r"variants/all_samples_excesshet.vcf") + r"variants/all_samples_excesshet.vcf.gz") def filterExcessHets(infile, outfile): ''' Hard filter on Excess Heterozygotes @@ -532,8 +532,8 @@ def filterExcessHets(infile, outfile): @transform(filterExcessHets, - regex(r"variants/all_samples_excesshet.vcf"), - r"variants/all_samples_sitesonly.vcf") + regex(r"variants/all_samples_excesshet.vcf.gz"), + r"variants/all_samples_sitesonly.vcf.gz") def sitesOnlyVcf(infile,outfile): ''' Create sites-only VCF with MakeSitesOnlyVcf @@ -551,7 +551,7 @@ def sitesOnlyVcf(infile,outfile): # SNP Recalibration @transform(sitesOnlyVcf, - regex(r"variants/all_samples_sitesonly.vcf"), + regex(r"variants/all_samples_sitesonly.vcf.gz"), r"variants/all_samples.snp_vqsr.recal") def variantRecalibratorSnps(infile, outfile): '''Create variant recalibration file''' @@ -568,10 +568,10 @@ def variantRecalibratorSnps(infile, outfile): @follows(variantRecalibratorSnps) @transform(filterExcessHets, - regex(r"variants/all_samples_excesshet.vcf"), + regex(r"variants/all_samples_excesshet.vcf.gz"), add_inputs(r"variants/all_samples.snp_vqsr.recal", r"variants/all_samples.snp_vqsr.tranches"), - r"variants/all_samples.snp_vqsr.vcf") + r"variants/all_samples.snp_vqsr.vcf.gz") def applyVQSRSnps(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles @@ -585,7 +585,7 @@ def applyVQSRSnps(infiles, outfile): @transform(sitesOnlyVcf, - regex(r"variants/all_samples_sitesonly.vcf"), + regex(r"variants/all_samples_sitesonly.vcf.gz"), r"variants/all_samples.indel_vqsr.recal") def variantRecalibratorIndels(infile, outfile): '''Create variant recalibration file''' @@ -601,10 +601,10 @@ def variantRecalibratorIndels(infile, outfile): @follows(variantRecalibratorIndels) @transform(applyVQSRSnps, - regex(r"variants/all_samples.snp_vqsr.vcf"), + regex(r"variants/all_samples.snp_vqsr.vcf.gz"), add_inputs(r"variants/all_samples.indel_vqsr.recal", r"variants/all_samples.indel_vqsr.tranches"), - r"variants/all_samples.vqsr.vcf") + r"variants/all_samples.vqsr.vcf.gz") def applyVQSRIndels(infiles, outfile): '''Perform variant quality score recalibration using GATK ''' vcf, recal, tranches = infiles @@ -614,130 +614,91 @@ def applyVQSRIndels(infiles, outfile): outfile, genome, mode, GATK_MEMORY) + ############################################################################### ############################################################################### ############################################################################### -# HapMap Genotypes +# Variant Annotation -@follows(mkdir("hapmap")) -@files(PARAMS["hapmap_vcf"], "hapmap/hapmap_exome.bed") -def SelectExonicHapmapVariants(infile, outfile): - '''Select variants from HapMap project that are located within - exome target regions. 
Assumes bgzipped & tabix indexed Hapmap VCF file.''' - bed = PARAMS["roi_regions"] - statement = '''tabix -B %(infile)s %(bed)s | - awk '{OFS="\\t" if (!/^#/){print $1,$2-1,$2}}' - > %(outfile)s''' % locals() - P.run(statement) +############################################################################### +# Somalier - assessing relatedness of samples +@transform(applyVQSRIndels, + regex(r"variants/all_samples.vqsr.vcf.gz"), + r"variants/somalier/somalier.log") +def extractSomalier(infile,outfile): + '''extract list of polymorphic sites using Somalier''' -@follows(SelectExonicHapmapVariants) -@transform(RemoveDuplicatesSample, - regex(r"gatk/(\S+).dedup2.bam"), - add_inputs("hapmap/hapmap_exome.bed"), - r"hapmap/\1.hapmap.vcf") -def HapMapGenotype(infiles, outfile): - '''Genotype HapMap SNPs using HaplotypeCaller in each individual''' - infile, intervals = infiles genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - dbsnp = PARAMS["gatk_dbsnp"] - padding = PARAMS["hapmap_padding"] - options = PARAMS["hapmap_hc_options"] - exome.haplotypeCaller(infile, outfile, genome, dbsnp, - intervals, padding, options) - - -@transform(HapMapGenotype, regex(r"hapmap/(\S+).hapmap.vcf"), - r"hapmap/\1.hapmap.vcf.gz") -def indexVCFs(infile, outfile): - '''Genotype HapMap SNPs using HaplotypeCaller in each individual''' - statement = '''bgzip -c %(infile)s > %(outfile)s && - tabix -p vcf %(outfile)s; ''' + somalier = PARAMS['annotation_somalier_path'] + sites = PARAMS['annotation_somalier_sites'] + outdir = os.path.dirname(outfile) + statement = '''%(somalier)s extract + -d %(outdir)s + --sites %(sites)s + -f %(genome)s + %(infile)s + > %(outfile)s 2>&1 + ''' P.run(statement) -############################################################################### -# Compare Hapmap genotypes to assess relatedness +@transform(extractSomalier, + regex(r"variants/somalier/somalier.log"), + r"variants/somalier/relate.log") +def relateSomalier(infile,outfile): + '''Calculate Ancestry based on 1000G project''' -@follows(indexVCFs, mkdir("hapmap/vcfcompare")) -@permutations("hapmap/*.hapmap.vcf.gz", - formatter("hapmap/(\S+).hapmap.vcf.gz$"), - 2, - r"hapmap/vcfcompare/{basename[0][0]}_vs_{basename[1][0]}.vcfcompare") -def vcfCompare(infiles, outfile): - '''Compare HapMap genotypes from each pair of calls ''' - sample1, sample2 = infiles - name1 = P.snip(os.path.basename(sample1), ".hapmap.vcf.gz") - name2 = P.snip(os.path.basename(sample2), ".hapmap.vcf.gz") - statement = '''vcf-compare -g -m %(name1)s:%(name2)s - %(sample1)s %(sample2)s > %(outfile)s''' + somalier = PARAMS['annotation_somalier_path'] + outdir = os.path.dirname(outfile) + statement ='''%(somalier)s relate + %(outdir)s/*.somalier + > %(outfile)s 2>&1''' P.run(statement) -@transform(vcfCompare, - regex(r"hapmap/vcfcompare/(\S+).vcfcompare"), - r"hapmap/vcfcompare/\1.ndr") -def parseVcfCompare(infile, outfile): - '''Extract non-reference discordance rate for each comparison''' - statement = '''cat %(infile)s - | grep "Non-reference Discordance Rate (NDR):" - | cut -f 3 - > %(outfile)s''' +@transform(extractSomalier, + regex(r"variants/somalier/somalier.log"), + r"variants/somalier/ancestry.log") +def ancestrySomalier(infile,outfile): + '''Calculate Ancestry based on 1000G project''' + + somalier = PARAMS['annotation_somalier_path'] + labels = PARAMS['annotation_somalier_labels'] + reference = PARAMS['annotation_somalier_1kg'] + outdir = os.path.dirname(outfile) + statement ='''%(somalier)s ancestry + --labels %(labels)s %(reference)s/*.somalier 
++ + %(outdir)s/*.somalier + > %(outfile)s 2>&1''' P.run(statement) -@merge(parseVcfCompare, "hapmap/vcfcompare/ndr.tsv") -def mergeNDR(infiles, outfile): - '''Merge NDR values for all files''' - outf = open(outfile, "w") - outf.write("sample1\tsample2\tNDR\n") - for infile in infiles: - inbase = P.snip(os.path.basename(infile), ".ndr") - sample1, sample2 = inbase.split("_vs_") - inf = open(infile, "r") - ndr = inf.readline().strip() - result = "\t".join([sample1, sample2, ndr]) - outf.write(result + "\n") - inf.close() - outf.close() - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(mergeNDR, regex(r"hapmap/vcfcompare/ndr.tsv"), - r"hapmap/vcfcompare/ndr.load") -def loadNDR(infile, outfile): - '''Load NDR into database''' - P.load(infile, outfile) - - -############################################################################### -############################################################################### -############################################################################### -# Variant Annotation - ############################################################################### # SnpSift @transform(applyVQSRIndels, - regex(r"variants/all_samples.vqsr.vcf"), - r"variants/all_samples.snpeff.vcf") + regex(r"variants/all_samples.vqsr.vcf.gz"), + r"variants/all_samples.snpeff.vcf.gz") def annotateVariantsSNPeff(infile, outfile): '''Annotate variants using SNPeff''' job_memory = PARAMS["annotation_memory"] job_threads = PARAMS["annotation_threads"] snpeff_genome = PARAMS["annotation_snpeff_genome"] config = PARAMS["annotation_snpeff_config"] - statement = ( - "snpEff -Xmx%(job_memory)s " - "-c %(config)s " - "-v %(snpeff_genome)s " - "%(infile)s > %(outfile)s 2> %(outfile)s.log" % locals()) + outfile = P.snip(outfile,".gz") + statement = ''' snpEff -Xmx%(job_memory)s + -c %(config)s + -v %(snpeff_genome)s + %(infile)s > %(outfile)s 2> %(outfile)s.log; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz''' % locals() P.run(statement, job_memory=PARAMS["annotation_memory"]) @transform(annotateVariantsSNPeff, - regex(r"variants/all_samples.snpeff.vcf"), + regex(r"variants/all_samples.snpeff.vcf.gz"), r"variants/all_samples.snpeff.table") def vcfToTableSnpEff(infile, outfile): '''Converts vcf to tab-delimited file''' @@ -1994,9 +1955,7 @@ def gatk(): pass -@follows(loadXYRatio, - indexVCFs, - loadNDR) +@follows(loadXYRatio) def sampleFeatures(): pass diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 023634966..dc8a4ef51 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -79,12 +79,14 @@ gatk: hapmap: /path/to/hapmap.vcf - omni: /path/to/omni/1000G_omni2.5.hg19.sites.vcf + omni: /path/to/1000G_omni2.5.hg19.sites.vcf kgenomes: /path/to/1000G_phase1.snps.high_confidence.hg19.sites.vcf mills: /path/to/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf + axiom: /path/to/axioum + solid_options: '' hc_options: -A StrandOddsRatio -A DepthPerSampleHC From a3bf625b8fe6f853539aaa42cccce7cd3d142690 Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 11 May 2022 17:24:41 +0100 Subject: [PATCH 24/43] remove xy prediction, which is provided by other tools and previous execution not ideal --- cgatpipelines/tools/pipeline_exome.py | 40 --------------------------- 1 file changed, 40 deletions(-) diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index abd7a93da..cd6418eaa 100644 --- 
a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -430,40 +430,6 @@ def loadCoverageStats(infiles, outfile): bamstats.loadPicardCoverageStats(infiles, outfile) -############################################################################### -# Guess sex - -@follows(mkdir("xy_ratio")) -@transform(RemoveDuplicatesSample, - regex(r"gatk/(\S+).dedup2.bam"), - r"xy_ratio/\1.sex") -def calcXYratio(infile, outfile): - '''Guess the sex of a sample based on ratio of reads - per megabase of sequence on X and Y''' - exome.guessSex(infile, outfile) - - -@merge(calcXYratio, "xy_ratio/xy_ratio.tsv") -def mergeXYRatio(infiles, outfile): - '''merge XY ratios from all samples and load into database''' - inlist = " ".join(infiles) - statement = '''cgat combine_tables - --add-file-prefix --regex-filename="xy_ratio/(\S+).sex" - --no-titles --missing-value=0 --ignore-empty - -L %(outfile)s.log -v 6 - --cat=Track %(inlist)s - > %(outfile)s''' - P.run(statement) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(mergeXYRatio, regex(r"xy_ratio/xy_ratio.tsv"), - r"xy_ratio/xy_ratio.load") -def loadXYRatio(infile, outfile): - '''load into database''' - P.load(infile, outfile, "--header-names=Track,X,Y,XY_ratio") - - ############################################################################### ############################################################################### @@ -1955,11 +1921,6 @@ def gatk(): pass -@follows(loadXYRatio) -def sampleFeatures(): - pass - - @follows(haplotypeCaller, genotypeGVCFs) def callVariants(): @@ -2020,7 +1981,6 @@ def vcfstats(): @follows(mapping_tasks, gatk, - # sampleFeatures, callVariants, annotation, filtering, From 2e56974401aea3a780bd7618ea541b8dc571b2da Mon Sep 17 00:00:00 2001 From: jscaber Date: Thu, 12 May 2022 00:02:36 +0100 Subject: [PATCH 25/43] Exome Pipeline Development - fixes for annotations - removal of ancestry functions, better covered by somalier - removal of remaining sex estimation functions --- cgatpipelines/tasks/exome.py | 30 +- cgatpipelines/tools/pipeline_exome.py | 334 +++--------------- .../tools/pipeline_exome/pipeline.yml | 24 +- 3 files changed, 78 insertions(+), 310 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 538d1bdf5..98298269d 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -337,6 +337,20 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem): ############################################################################## +def snpSift(infile, outfile, vcf, bgzip=True, memory="6G"): + '''Annotate VCF with other VCF''' + job_memory=memory + statement = '''SnpSift annotate %(vcf)s + %(infile)s > %(outfile)s 2> %(outfile).log;''' + if bgzip: + statement += '''bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz''' + P.run(statement) + + +############################################################################## + + def selectVariants(infile, outfile, genome, select): '''Filter de novo variants based on provided expression''' statement = '''gatk SelectVariants @@ -409,22 +423,6 @@ def buildSelectStatementfromPed(filter_type, pedfile, template): ############################################################################## - -def guessSex(infile, outfile): - '''Guess the sex of a sample based on ratio of reads - per megabase of sequence on X and Y''' - statement = '''calc `samtools idxstats %(infile)s - | grep 'X' - | awk '{print $3/($2/1000000)}'` - /`samtools idxstats %(infile)s | grep 'Y' - | awk '{print 
$3/($2/1000000)}'` - | tr -d " " | tr "=" "\\t" | tr "/" "\\t" - > %(outfile)s''' - P.run(statement) - -############################################################################## - - def filterMutect(infile, outfile, logfile, control_id, tumour_id, min_t_alt, min_n_depth, diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index cd6418eaa..1022d605a 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -18,18 +18,18 @@ 3. Local realignment and BQSR in GATK and deduplication in Picard 4. Calculate the ratio of reads on the X and Y chromosomes to assert sex 5. Variant calling in families (SNVs & indels) using GATK HaplotypeCaller - 7. Comparison of join VCF genotypes to assess relatedness (somalier) - 8. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift - 9. Variant quality score recalibration (GATK) - 10. Flags variants within genes of interest (such as known disease genes) + 6. Comparison of join VCF genotypes to assess relatedness (somalier) + 7. Variant annotation using SNPeff, GATK VariantAnnotator, and SnpSift + 8. Variant quality score recalibration (GATK) + 9. Flags variants within genes of interest (such as known disease genes) (GATK) (optional) - 11. Filters potential de novo variants - 12. Filters potential de novo variants using lower stringency criteria - 13. Filters potential dominant mutations - 14. Filters potential homozygous recessive mutations - 15. Filters potential compound heterozygous mutations - 16. Generates summary statistics for unfiltered vcf file - 17. Generates report + 10. Filters potential de novo variants + 11. Filters potential de novo variants using lower stringency criteria + 12. Filters potential dominant mutations + 13. Filters potential homozygous recessive mutations + 14. Filters potential compound heterozygous mutations + 15. Generates summary statistics for unfiltered vcf file + 16. Generates report .. 
note:: @@ -684,88 +684,75 @@ def loadTableSnpEff(infile, outfile): # SnpSift @transform(annotateVariantsSNPeff, - regex(r"variants/all_samples.snpeff.vcf"), - r"variants/all_samples.snpsift.vcf") + regex(r"variants/all_samples.snpeff.vcf.gz"), + r"variants/all_samples.snpsift_1.vcf.gz") def annotateVariantsDBNSFP(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" job_threads = PARAMS["annotation_threads"] dbNSFP = PARAMS["annotation_dbnsfp"] - dbN_annotators = PARAMS["annotation_dbnsfpannotators"] - if len(dbN_annotators) == 0: + dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"] + outfile = P.snip(outfile,".gz") + if not dbNSFP_columns: annostring = "" else: - annostring = "-f %s" % dbN_annotators + annostring = "-f %s" % dbNSFP_columns - statement = """SnpSift.sh dbnsxfp -db %(dbNSFP)s -v %(infile)s - %(annostring)s > - %(outfile)s;""" + statement = '''SnpSift dbnsfp + -db %(dbNSFP)s -v %(infile)s + %(annostring)s > + %(outfile)s 2> %(outfile)s.log; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz''' P.run(statement) @transform(annotateVariantsDBNSFP, - regex(r"variants/all_samples_dbnsfp.snpsift.vcf"), - r"variants/all_samples_clinvar.snpsift.vcf") + regex(r"variants/all_samples.snpsift_1.vcf.gz"), + r"variants/all_samples.snpsift_2.vcf.gz") def annotateVariantsClinvar(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" job_threads = PARAMS["annotation_threads"] clinvar = PARAMS["annotation_clinvar"] - intout = outfile.replace("samples", "samples_clinvar") - statement = """SnpSift.sh annotate %(clinvar)s - %(infile)s > %(outfile)s;""" - P.run(statement) + outfile = P.snip(outfile,".gz") + exome.snpSift(infile,outfile,clinvar) @transform(annotateVariantsClinvar, - regex(r"variants/all_samples_clinvar.snpsift.vcf"), - r"variants/all_samples_exac.snpsift.vcf") -def annotateVariantsExAC(infile, outfile): - '''Add annotations using SNPsift''' - job_memory = "6G" - job_threads = PARAMS["annotation_threads"] - exac = PARAMS["annotation_exac"] - statement = """SnpSift.sh annotate - %(exac)s - %(infile)s > %(outfile)s;""" - P.run(statement) - - -@transform(annotateVariantsExAC, - regex(r"variants/all_samples_exac.snpsift.vcf"), - r"variants/all_samples_gwasc.snpsift.vcf") + regex(r"variants/all_samples_exac.snpsift_2.vcf.gz"), + r"variants/all_samples_gwasc.snpsift_3.vcf.gz") def annotateVariantsGWASC(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" job_threads = PARAMS["annotation_threads"] gwas_catalog = PARAMS["annotation_gwas_catalog"] - statement = """SnpSift.sh gwasCat -db %(gwas_catalog)s + statement = """SnpSift gwasCat -db %(gwas_catalog)s %(infile)s > %(outfile)s;""" P.run(statement) @transform(annotateVariantsGWASC, - regex(r"variants/all_samples_gwasc.snpsift.vcf"), - r"variants/all_samples_phastcons.snpsift.vcf") + regex(r"variants/all_samples.snpsift_3.vcf.gz"), + r"variants/all_samples.snpsift_4.vcf.gz") def annotateVariantsPhastcons(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" job_threads = PARAMS["annotation_threads"] - genomeind = "%s/%s.fa.fai" % ( - PARAMS['general_genome_dir'], - PARAMS['general_genome']) + genome_index = "%s/%s.fa.fai" % ( + PARAMS['genome_dir'], + PARAMS['genome']) phastcons = PARAMS["annotation_phastcons"] intout = outfile.replace("samples", "samples_phastc") - statement = """ln -sf %(genomeind)s %(phastcons)s/genome.fai && - SnpSift.sh phastCons %(phastcons)s %(infile)s > + statement = """ln -sf %(genome_index)s %(phastcons)s/genome.fai && + SnpSift phastCons 
%(phastcons)s %(infile)s > %(outfile)s;""" P.run(statement) @transform(annotateVariantsPhastcons, - regex(r"variants/all_samples_phastcons.snpsift.vcf"), - r"variants/all_samples_1000G.snpsift.vcf") + regex(r"variants/all_samples.snpsift_4.vcf.gz"), + r"variants/all_samples.snpsift_5.vcf.gz") def annotateVariants1000G(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" @@ -774,11 +761,7 @@ def annotateVariants1000G(infile, outfile): vcfs = [] for f in os.listdir(PARAMS["annotation_tgdir"]): if f.endswith(".vcf.gz"): - if PARAMS['test'] == 1: - if "chr14" in f: - vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) - else: - vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) + vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) T = P.get_temp_filename(".") shutil.copy(infile, T) @@ -786,7 +769,7 @@ def annotateVariants1000G(infile, outfile): tempout = P.get_temp_filename(".") for vcf in vcfs: - statement = """SnpSift.sh annotate + statement = """SnpSift annotate %(vcf)s %(tempin)s > %(tempout)s && mv %(tempout)s %(tempin)s""" @@ -796,18 +779,13 @@ def annotateVariants1000G(infile, outfile): @transform(annotateVariants1000G, - regex(r"variants/all_samples_1000G.snpsift.vcf"), - r"variants/all_samples_dbsnp.snpsift.vcf") + regex(r"variants/all_samples.snpsift_5.vcf.gz"), + r"variants/all_samples_dbsnp.snpsift_final.vcf.gz") def annotateVariantsDBSNP(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" job_threads = PARAMS["annotation_threads"] - dbsnp = PARAMS["annotation_dbsnp"] - statement = """SnpSift.sh annotate - %(dbsnp)s - %(infile)s > %(outfile)s;""" - + exome.snpSift(infile,outfile,dbsnp) P.run(statement) @@ -896,229 +874,7 @@ def makeAnnotationsTables(infiles, outfile): -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."' %(inputvcf)s >> %(outfile)s''' P.run(statement) - - -############################################################################### -# Ancestry Functions # -HAPMAP = "%s/*txt.gz" % PARAMS['hapmap_loc'] - - -@merge(HAPMAP, ["snpdict.json", - "randomsnps.tsv"]) -def makeRandomSNPSet(infiles, outfiles): - ''' - Generates a random set of SNPs to use to characterise the ancestry - and relatedness of the samples. - - 1. Finds all SNPs which have a known genotype frequency for all 11 - hapmap ancestries - 2. Picks a random 50000 of these SNPs - 3. Stores a list of these SNPs as randomsnps.tsv - 4. Builds a dictionary of dictionaries of dictionaries where: - At Level 1 each key is a HapMap ancestry ID - each of these keys leads to another dictionary - level 2. - At Level 2 each key is a dbSNP SNP ID - each of these keys leads to another dictionary - level 3. - At Level 3 each key is a genotype: - the values of these keys are the genotype frequency of this - genotype at this SNP in this ancestry. - e.g. snpdict['ASW']['rs000000001']['CT'] --> - frequency of the CT genotype at the rs0000001 SNP in the - ASW population - 5. Stores this dictionary in json format as randomsnps.json - ''' - rs = PARAMS['general_randomseed'] - exomeancestry.MakeSNPFreqDict(infiles, outfiles, rs, submit=True) - - -@follows(mkdir("sample_genotypes.dir")) -@transform(makeAnnotationsTables, regex("variant_tables/(.*).tsv"), - add_inputs(makeRandomSNPSet), r"sample_genotypes.dir/\1.tsv") -def getSampleGenotypes(infiles, outfile): - ''' - Fetches the genotype from the variant tables for all samples - for SNPs in the hapmap sample from makeRandomSNPSet. - - Complex sites are ignored (as simple SNPs are sufficient for these - calculations). 
- These are: - Sites which failed QC (column 3 in the variant table is not PASS) - Sites with more than 2 alleles defined (column 6 in the variant table - contains more than one alternative allele) - SNPs with more than one ID - Indels - ''' - snps = set([line.strip() - for line in iotools.open_file(infiles[1][1]).readlines()]) - exomeancestry.GenotypeSNPs(infiles[0], snps, outfile, submit=True) - - -@merge(getSampleGenotypes, "calledsnps.tsv") -def concatenateSNPs(infiles, outfile): - ''' - Lists all the SNPs in the sample from makeRandomSNPSet where a variant has - been called in any of the input files. - Stores the reference genotype, chromosome and position to use later. - ''' - snps = set() - for f in infiles: - with iotools.open_file(f) as inp: - for line in inp: - line = line.strip().split("\t") - snps.add("%s\t%s\t%s\t%s" % (line[0], line[4], - line[1], line[2])) - out = iotools.open_file(outfile, "w") - for snp in snps: - out.write("%s\n" % (snp)) - out.close() - - -@follows(mkdir("sample_freqs")) -@transform(getSampleGenotypes, regex("sample_genotypes.dir/(.*).tsv"), - add_inputs(concatenateSNPs, makeRandomSNPSet), - [r"sample_freqs/\1.tsv", r"sample_freqs/\1_anc.tsv"]) -def calculateAncestry(infiles, outfiles): - ''' - Takes the data stored in MakeRandomSNPSet and the genotype of each sample - at each site in calledsnps.tsv and tabulates the frequency of this - genotype in each of the HapMap ancestry categories. - The overall probability of each ancestry is then calculated as the - product of these frequencies. These can only be used in comparison to - each other - to show which of the 11 ancestries is most probable. - ''' - calledsnps = infiles[1] - snpdict = infiles[2][0] - exomeancestry.CalculateAncestry(infiles[0], calledsnps, snpdict, - outfiles, submit=True) - - -@merge(calculateAncestry, "ancestry_estimate.tsv") -def mergeAncestry(infiles, outfile): - ''' - Merges the output of the ancestry calculations for each sample into a - single table. - Draws a plot showing the score (probabilty) for each individual - for their assigned ancestry and the second closest match. - x = individual - y = score - large diamonds represent the best match and small triangles the second - best match - ''' - out = iotools.open_file(outfile, "w") - for f in infiles: - infile = f[1] - scores = [] - ancs = [] - for line in iotools.open_file(infile).readlines(): - line = line.strip().split("\t") - anc = line[0] - score = decimal.Decimal(line[1]) - ancs.append(anc) - scores.append(score) - z = list(zip(ancs, scores)) - s = sorted(z, key=lambda x: x[1])[::-1] - out.write("%s\t%s\t%s\t%s\t%s\n" % (f[0], - s[0][0], s[0][1], - s[1][0], s[1][1])) - out.close() - exomeancestry.PlotAncestry(outfile) - - -@active_if(len(matches) > 1) -@merge((calculateAncestry, concatenateSNPs), - ["all_samples.ped", "all_samples.map"]) -def makePed(infiles, outfiles): - ''' - Generates the required input for the Plink and King software packages. - - PED file - columns are SNPs and rows are samples, each cell is the - genotype of the sample at this SNP - - MAP file - rows are SNPs in the same order as the columns in the ped - file, each row shows chromosome, snp id and position. 
- ''' - exomeancestry.MakePEDFile(infiles, outfiles) - - -@active_if(len(matches) > 1) -@transform(makePed, suffix(".ped"), ".ibs0") -def runPlinkandKing(infiles, outfile): - ''' - Uses King software to calculate relatedness of pairs of samples as - described here - http://people.virginia.edu/~wc9c/KING/manual.html - Plink is used just to reformat the PED files into the format required by - King. - ''' - pref = infiles[0].split(".")[0] - k = PARAMS['king_path'] - p = PARAMS['king_plink'] - statement = """ - %(p)s/plink --file %(pref)s --make-bed --no-fid --noweb --map3 - --no-parents --no-sex --no-pheno && - %(k)s/king -b plink.bed --binary --prefix %(pref)s && - %(k)s/king -b %(pref)s.bgeno --kinship --ibs --prefix %(pref)s""" - P.run(statement) - - -@active_if(len(matches) > 1) -@merge(runPlinkandKing, "family_estimate.tsv") -def calculateFamily(infile, outfile): - ''' - Translates and filters the output from King. - Pairs of related samples are written to the output file along with the - degree of relatedness. Degrees are decided using thresholds from - the King documentation, here - http://people.virginia.edu/~wc9c/KING/manual.html - ''' - exomeancestry.CalculateFamily(infile, outfile) - - -@active_if(len(matches) > 1) -@merge(makeAnnotationsTables, "sex_estimate.tsv") -def calculateSex(infiles, outfile): - ''' - Approximates the sex of the sample using the data in the variant table. - Basic estimate based on heterozygosity on the X chromosome - genotypes - are taken for all called variants on X passing QC and the percentage - of heterozygotes is taken. - This tends to produce two clear populations so Kmeans clustering - is used to split the data into two - male and female. Samples which are - unclear are marked in the output. - ''' - exomeancestry.CalculateSex(infiles, outfile, submit=True) - - -@merge((mergeAncestry, calculateFamily, calculateSex), "summary.tsv") -def summarise(infiles, outfile): - ''' - Builds a summary table for ancestry, family and sex. 
- ''' - ancestry = pd.read_csv(infiles[0], sep="\t", header=None) - if len(matches) > 1: - family = pd.read_csv(infiles[1], sep="\t", header=None) - sex = pd.read_csv(infiles[2], sep="\t", header=None) - ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")] - family[0] = [a[-1] for a in family[0].str.split("/")] - sex[0] = [a[-1] for a in sex[0].str.split("/")] - sex = sex.drop([1, 2], 1) - sex.columns = ['id', 'sex', 'sex_significance'] - family.columns = ['id', 'related_to', 'degree_relatedness'] - family['degree_relatedness'] = family['degree_relatedness'].astype(int) - ancestry = ancestry.drop([2, 3, 4], 1) - ancestry.columns = ['id', 'ancestry'] - summary = ancestry.merge(sex).merge(family, 'left') - summary = summary.fillna("NA") - summary.to_csv(outfile, sep="\t") - else: - ancestry[0] = [a[-1] for a in ancestry[0].str.split("/")] - ancestry = ancestry.drop([2, 3, 4], 1) - ancestry.columns = ['id', 'ancestry'] - ancestry = ancestry.fillna("NA") - ancestry.to_csv(outfile, sep="\t") - - -@follows(summarise) -def ancestry(): - pass - + ############################################################################### # Quality Filtering # diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index dc8a4ef51..03d967266 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -120,15 +120,29 @@ annotation: # path to dbNSFP database dbnsfp: /path/to/dbNFSP.txt.gz - #dbNSFP columns to include - dbnsfpannotators: [SIFT_pred,Polyphen2_HDIV_pred,Polyphen2_HVAR_pred,MutationTaster_pred] + # dbNSFP columns to include + # Default (if left empty): Uniprot_acc, Interpro_domain, + # SIFT_pred, Polyphen2_HDIV_pred, Polyphen2_HVAR_pred, + # LRT_pred, MutationTaster_pred, GERP++_NR, GERP++_RS, + # phastCons100way_vertebrate, 1000Gp1_AF, 1000Gp1_AFR_AF, + # 1000Gp1_EUR_AF, 1000Gp1_AMR_AF, 1000Gp1_ASN_AF, + # ESP6500_AA_AF, ESP6500_EA_AF, MutationTaster_pred, + # MutationAssessor_pred, FATHMM_pred, PROVEAN_pred, + # CADD_phred, MetaSVM_pred, 1000Gp3_AC, 1000Gp3_AF, + # 1000Gp3_AFR_AC, 1000Gp3_AFR_AF, 1000Gp3_EUR_AC, + # 1000Gp3_EUR_AF, 1000Gp3_AMR_AC, 1000Gp3_AMR_AF, + # 1000Gp3_EAS_AC, 1000Gp3_EAS_AF, 1000Gp3_SAS_AC, + # 1000Gp3_SAS_AF, ESP6500_AA_AC, ESP6500_AA_AF, + # ESP6500_EA_AC, ESP6500_EA_AF, ExAC_AC, ExAC_AF, + # ExAC_Adj_AC, ExAC_Adj_AF, ExAC_AFR_AC, ExAC_AFR_AF, + # ExAC_AMR_AC, ExAC_AMR_AF, ExAC_EAS_AC, + # ExAC_EAS_AF, ExAC_FIN_AC, ExAC_FIN_AF, ExAC_NFE_AC, + # ExAC_NFE_AF, ExAC_SAS_AC, ExAC_SAS_AF + dbnsfp_columns: # path to clinvar vcf clinvar: /path/to/clinvar_DATE.vcf.gz - # path to ExAC vcf - exac: /path/to/ExAC.r0.3.1.sites.vep.vcf.gz - # path to dbsnp vcf dbsnp: /path/to/All_20160527.vcf.gz From 414358348395b590bcda4ce0861d59cf70bcb3b9 Mon Sep 17 00:00:00 2001 From: jscaber Date: Thu, 12 May 2022 06:34:26 +0100 Subject: [PATCH 26/43] bugfixes exome pipeline:snpsift and revomal exomeancestry.py --- cgatpipelines/tasks/exome.py | 2 +- cgatpipelines/tasks/exomeancestry.py | 487 -------------------------- cgatpipelines/tools/pipeline_exome.py | 35 +- 3 files changed, 22 insertions(+), 502 deletions(-) delete mode 100644 cgatpipelines/tasks/exomeancestry.py diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 98298269d..0afbefc89 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -341,7 +341,7 @@ def snpSift(infile, outfile, vcf, bgzip=True, memory="6G"): '''Annotate VCF with other VCF''' job_memory=memory statement = 
'''SnpSift annotate %(vcf)s - %(infile)s > %(outfile)s 2> %(outfile).log;''' + %(infile)s > %(outfile)s 2> %(outfile)s.log;''' if bgzip: statement += '''bgzip %(outfile)s; tabix -p vcf %(outfile)s.gz''' diff --git a/cgatpipelines/tasks/exomeancestry.py b/cgatpipelines/tasks/exomeancestry.py deleted file mode 100644 index 7618bab10..000000000 --- a/cgatpipelines/tasks/exomeancestry.py +++ /dev/null @@ -1,487 +0,0 @@ -import pandas as pd -import numpy as np -import cgatcore.iotools as iotools -import random -import itertools -from cgatcore.pipeline import cluster_runnable -import json -import decimal -from sklearn.cluster import KMeans -import matplotlib.pyplot as plt -import matplotlib.cm as cm -import matplotlib.patches as mpatches - - -@cluster_runnable -def MakeSNPFreqDict(infiles, outfiles, rs): - ''' - Generates a random set of SNPs to use to characterise the ancestry - and relatedness of the samples. - - 1. Finds all SNPs which have a known genotype frequency for all 11 - hapmap ancestries - 2. Picks a random 50000 of these SNPs - 3. Stores a list of these SNPs as randomsnps.tsv - 4. Builds a dictionary of dictionaries of dictionaries where: - At Level 1 each key is a HapMap ancestry ID - each of these keys leads to another dictionary - level 2. - At Level 2 each key is a dbSNP SNP ID - each of these keys leads to another dictionary - level 3. - At Level 3 each key is a genotype: - the values of these keys are the genotype frequency of this - genotype at this SNP in this ancestry. - e.g. snpdict['ASW']['rs000000001']['CT'] --> - frequency of the CT genotype at the rs0000001 SNP in the - ASW population - 5. Stores this dictionary in json format as randomsnps.json - ''' - - # list all chromosomes - chroms = np.array([f.split("/")[-1].split("_")[2] for f in infiles]) - uchroms = set(chroms) - infiles = np.array(infiles) - snpdict = dict() - for chrom in uchroms: - # get all the files for this chromosome - c = np.where(chroms == chrom)[0] - thischrom = infiles[c] - snpidsets = [] - # make a set of all the snp ids of known frequency for - # this chromsome for each ancestry - for f in thischrom: - snpids = [] - anc = f.split("/")[-1].split("_")[3] - with iotools.open_file(f) as inp: - for line in inp: - line = line.strip().split(" ") - snpids.append(line[0]) - snpids = set(snpids) - snpidsets.append(snpids) - # find the snp ids where frequency is known in all ancestries - thischromsnps = set.intersection(*snpidsets) - snpdict[chrom] = thischromsnps - - vals = list(snpdict.values()) - # set of all the SNPs genotyped in all ancestries on all chroms - pooled = set.union(*vals) - # take a random sample of snps from this set - random.seed(rs) - sam = set(random.sample(pooled, 50000)) - - # make a new dictionary of just the sampled snps for each chrom - snpdictsam = dict() - for key in snpdict: - i = snpdict[key] & sam - snpdictsam[key] = i - - # generate all possible genotypes as list of strings - # (e.g. 
CC CT CG CA TT etc) - res = ["%s%s" % (g[0], g[1]) - for g in itertools.permutations("CGAT", 2)] - res += ['CC', 'GG', 'TT', 'AA'] - freqdict = dict() - ancs = set([l.split("/")[-1].split("_")[3] for l in infiles]) - - # build an empty dictionary of dictionaries of dictionaries - # to store genotype freqs for each ancestry - for anc in ancs: - freqdict.setdefault(anc, dict()) - for s in sam: - freqdict[anc].setdefault(s, dict()) - for r in res: - freqdict[anc][s][r] = 0.0 - - # read genotype freqs from the input file and store in the dictionary - for f in infiles: - bits = f.split("/")[-1].split("_") - anc = bits[3] - chrom = bits[2] - snpset = snpdictsam[chrom] - with iotools.open_file(f) as input: - for line in input: - line = line.strip().split(" ") - if line[0] in snpset: - G1 = "%s%s" % (line[10][0], line[10][-1]) - try: - F1 = float(line[11]) - G2 = "%s%s" % (line[13][0], line[13][-1]) - F2 = float(line[14]) - G3 = "%s%s" % (line[16][0], line[16][-1]) - F3 = float(line[17]) - except: - # because occasionally the GFs are not numeric - continue - freqdict[anc][line[0]][G1] = F1 - freqdict[anc][line[0]][G2] = F2 - freqdict[anc][line[0]][G3] = F3 - - # dump dictionary in a json file to resurrect later - outf = iotools.open_file(outfiles[0], "w") - json.dump(freqdict, outf) - outf.close() - - # store the sampled list of snps - out = iotools.open_file(outfiles[1], "w") - for snp in sam: - out.write("%s\n" % snp) - out.close() - - -@cluster_runnable -def GenotypeSNPs(infile, snplist, outfile): - ''' - Fetches the genotype from the variant tables for all samples - for SNPs in the hapmap sample from makeRandomSNPSet. - - Complex sites are ignored (as simple SNPs are sufficient for these - calculations). - These are: - Sites which failed QC (column 3 in the variant table is not PASS) - Sites with more than 2 alleles defined (column 6 in the variant table - contains more than one alternative allele) - SNPs with more than one ID - Indels - ''' - out = iotools.open_file(outfile, "w") - with iotools.open_file(infile) as inf: - for line in inf: - line = line.strip().split() - # if the variant passed QC - if line[4] == "PASS": - genotype = line[7] - # if the genotype looks normal e.g. 1/1 - if len(genotype) == 3: - # get the actual genotype (rather than the index) - if genotype[0] != ".": - ind1 = int(genotype[0]) - else: - ind1 = 0 - if genotype[2] != ".": - ind2 = int(genotype[2]) - else: - ind2 = 0 - A1 = line[5] - A2 = line[6].split(",") - AS = [A1] + A2 - - if len(AS) <= 2: - GT = "%s%s" % (AS[ind1], AS[ind2]) - refGT = "%s%s" % (A1, A1) - if len(GT) == 2: - if line[3][0:2] == "rs" and len( - line[3].split(";")) == 1: - snpid = line[3] - chrom = line[0] - pos = line[1] - if snpid in snplist: - out.write("%s\t%s\t%s\t%s\t%s\n" - % (snpid, chrom, pos, GT, - refGT)) - out.close() - - -@cluster_runnable -def CalculateAncestry(infile, calledsnps, snpdict, outfiles): - ''' - Takes the data stored in MakeRandomSNPSet and the genotype of each sample - at each site in calledsnps.tsv and tabulates the frequency of this - genotype in each of the HapMap ancestry categories. - The overall probability of each ancestry is then calculated as the - product of these frequencies. These can only be used in comparison to - each other - to show which of the 11 ancestries is most probable. 
- ''' - - # List the SNPs in the sample where a variant has been called - # Record the reference genotype at each of these SNPs - called = set() - refs = dict() - for line in iotools.open_file(calledsnps).readlines(): - line = line.strip().split("\t") - refs[line[0]] = line[1] - called.add(line[0]) - - # Record the genotype of the current sample at each SNP - currents = dict() - for line in iotools.open_file(infile).readlines(): - line = line.strip().split("\t") - currents[line[0]] = line[3] - - # Regenerate the genotype frequency dictionary made earlier - s = iotools.open_file(snpdict) - snpdict = json.load(s) - s.close() - ancs = list(snpdict.keys()) - - # Build a table of the frequency of the genotype in this sample - # in each of the hapmap ancestries - out = iotools.open_file(outfiles[0], "w") - out.write("snp\tgenotype\t%s\n" % "\t".join(ancs)) - for snp in called: - # where a variant hasn't been called in the current sample assume - # the reference genotype - if snp in currents: - geno = currents[snp] - else: - geno = refs[snp] - - res = [] - for anc in ancs: - D = snpdict[anc] - sub = D[snp] - freq = sub[geno] - res.append(freq) - - res = [str(r) for r in res] - out.write("%s\t%s\t%s\n" % (snp, geno, "\t".join(res))) - - out.close() - out = iotools.open_file(outfiles[1], "w") - - # read the output of the previous step into pandas - res = pd.read_csv(outfiles[0], sep="\t") - - # calculate the probability of each ancestry as the product of all the - # genotype frequencies - # SNPs where a genotype not recorded in hapmap has been called are - # skipped - # decimal package is used because the results are very small floats - res = res.replace(0, float('nan')) - for anc in set(ancs): - r = res[anc][np.invert(res[anc].isnull())] - r = [decimal.Decimal(d) for d in r] - out.write("%s\t%s\n" % (anc, np.product(r))) - out.close() - - -@cluster_runnable -def MakePEDFile(infiles, outfiles): - ''' - Generates the required input for the Plink and King software packages. - - PED file - columns are SNPs and rows are samples, each cell is the - genotype of the sample at this SNP - - MAP file - rows are SNPs in the same order as the columns in the ped - file, each row shows chromosome, snp id and position. - ''' - # Record the chromosome and position of each SNP - chromposdict = dict() - for line in iotools.open_file(infiles[-1]).readlines(): - line = line.strip().split("\t") - chromposdict[line[0]] = ((line[2], line[3])) - - # Generate the PED file - takes the genotype column of the table from - # the CalculateAncestry step and transposes it across a row - - out = iotools.open_file(outfiles[0], "w") - for inf in infiles[0:-1]: - inf = inf[0] - df = pd.read_csv(inf, sep="\t") - geno = df['genotype'] - genolist = [] - for g in geno: - genolist.append(" ".join(list(g))) - out.write("%s\t%s\n" % (inf.split("/")[-1], "\t".join(genolist))) - out.close() - - # Generate the MAP file - chromosome and position of each SNP in the same - # order as the PED file - mapf = iotools.open_file(outfiles[1], "w") - slist = df['snp'] - for snp in slist: - mapf.write("%s\t%s\t%s\n" % (chromposdict[snp][0], snp, - chromposdict[snp][1])) - mapf.close() - - -def CalculateFamily(infile, outfile): - ''' - Translates and filters the output from King. - Pairs of related samples are written to the output file along with the - degree of relatedness. 
Degrees are decided using thresholds from - the King documentation, here - http://people.virginia.edu/~wc9c/KING/manual.html - ''' - out = iotools.open_file(outfile, "w") - input = open(infile).readlines()[1:] - for line in input: - line = line.strip().split("\t") - score = float(line[14]) - # if there is a significant relatedness score - if score >= 0.0442: - ID1 = line[0] - ID2 = line[2] - # 2nd degree relatives - if score <= 0.177: - degree = 2 - # 1st degree relatives - elif score <= 0.354: - degree = 1 - # 0th degree relatives (ID twins or duplicate) - else: - degree = 0 - out.write("%s\t%s\t%s\n" % (ID1, ID2, degree)) - out.close() - - -@cluster_runnable -def CalculateSex(infiles, outfile): - ''' - Approximates the sex of the sample using the data in the variant table. - Basic estimate based on heterozygosity on the X chromosome - genotypes - are taken for all called variants on X passing QC and the percentage - of heterozygotes is taken. - This tends to produce two clear populations so Kmeans clustering - is used to split the data into two - male and female. Samples which are - unclear are marked in the output. - ''' - percs = [] - for f in infiles: - homs = 0 - tot = 0 - # Count the homozygous and total variants called on the - # X chromosome in each input file (after QC) - with iotools.open_file(f) as inf: - for line in inf: - line = line.strip().split("\t") - if line[0] == "chrX" and line[4] == "PASS": - geno = line[7] - s = set(list(geno)) - if len(s) == 2: - homs += 1 - tot += 1 - perc = float(homs) / float(tot) - percs.append(perc) - - # impose a grid on the data so clustering is possible - ests = np.array(percs) - ones = np.array([1] * len(ests)) - grid = np.column_stack((ones, ests)) - - # divide data into two clusters - means = KMeans(n_clusters=2) - fit1 = means.fit(grid) - res = fit1.predict(grid) - - # find the position of the cluster centres on the y axis - ccs = fit1.cluster_centers_ - cc1 = ccs[0, 1] - cc2 = ccs[1, 1] - ccs = [cc1, cc2] - - # pick out the scores from each cluster and calculate the standard dev - s1 = ests[res == 0] - s2 = ests[res == 1] - sds = [np.std(s1), np.std(s2)] - # males will be more homozygous - the bigger value represents males - if cc1 > cc2: - sexes = ["male", "female"] - else: - sexes = ["female", "male"] - - x = 0 - out = iotools.open_file(outfile, "w") - for f in infiles: - # for each input file take the group, the score, the standard deviation - # of this group, the distance from the cluster centre and the sex - group = res[x] - est = ests[x] - std = sds[group] - dist = est - ccs[group] - sex = sexes[group] - - # is the value with 3 SDs of the mean for this sex - if sex == "male": - if ((3 * std) - abs(dist) >= 0 or dist >= 0): - sig = "*" - else: - sig = "-" - else: - if ((3 * std) - abs(dist) >= 0 or dist <= 0): - sig = "*" - else: - sig = "-" - out.write("%s\t%f\t%f\t%s\t%s\n" % (f, est, dist, sexes[group], sig)) - x += 1 - out.close() - - # plot the output - blue dots male, red dots female - # lines represent 3 standard deviations from the cluster centre - p = plt.figure() - a = p.add_subplot('111') - - a.axis([0, len(infiles), - (min([min(s1), min(s2)]) - 0.1), (max([max(s1), max(s2)]) + 0.1)]) - - dist1 = np.linspace(0, len(infiles), len(s1)) - dist2 = np.linspace(0, len(infiles), len(s2)) - - a.plot(dist1, s1, 'b.') - a.plot(dist2, s2, 'r.') - - top = ccs[0] + (3 * sds[0]) - bottom = ccs[1] - (3 * sds[1]) - a.plot([0, len(infiles)], [top, top], 'b-') - a.plot([0, len(infiles)], [bottom, bottom], 'r-') - a.tick_params(axis='x', 
labelbottom='off', bottom='off') - a.yaxis.set_label_text('percentage homozygous X variants') - fig = outfile.replace(".tsv", ".png") - p.savefig(fig) - - -def PlotAncestry(infile): - ''' - Draws a plot showing the score (probabilty) for each individual - for their assigned ancestry and the second closest match. - x = individual - y = score - large diamonds represent the best match and small triangles the second - best match - ''' - ancestry = pd.read_csv(infile, sep="\t", header=None, dtype="str") - - # score using the index of the probability e.g. 5e-1000 as -1000 - ancestry[2] = [int(a[1]) * -1 for a in ancestry[2].str.split("-")] - ancestry[4] = [int(a[1]) * -1 for a in ancestry[4].str.split("-")] - - # sort by assigned ancestry - ancestry = ancestry.sort_values([1]) - ancs = list(set(ancestry[1].append(ancestry[3]))) - - # assign colours to points by ancestry - ancestry[5] = ancestry[1].apply(lambda x: ancs.index(x), 1) - ancestry[6] = ancestry[3].apply(lambda x: ancs.index(x), 1) - floats = np.linspace(0, 1, len(set(ancs))) - colours = cm.jet(floats) - - f = plt.figure(figsize=(30, 15)) - a = f.add_subplot('111') - ymin = min(ancestry[2]) - 100 - ymax = max(ancestry[4]) + 100 - - a.axis([-5, len(ancestry) + 5, ymin, ymax]) - - # plot best scoring ancestry - a.scatter(list(range(len(ancestry))), - ancestry[2], c=colours[ancestry[5].values], - s=500, marker='D', edgecolors='None') - - # plot second best scoring ancestry - a.scatter(list(range(len(ancestry))), - ancestry[4], c=colours[ancestry[6].values], - s=150, marker="^", edgecolors='None') - sancs = set(ancestry[1]) - i = 20 - - # draw lines between individuals assigned to different ancestries - for anc in sancs: - p = max(np.where(ancestry[1] == anc)[0]) + 0.5 - p2 = min(np.where(ancestry[1] == anc)[0]) + 0.5 - a.plot([p, p], - [ymin, ymax], 'k-') - a.text(p2, ymax - i, anc, ha='center') - i += 10 - - # legend - ps = [mpatches.Patch(color=col) for col in colours] - a.legend(handles=ps, labels=ancs, loc=3) - - f.savefig(infile.replace(".tsv", ".png")) diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 1022d605a..fbdb046de 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -179,7 +179,6 @@ import cgatpipelines.tasks.mapping as mapping import cgatpipelines.tasks.bamstats as bamstats import cgatpipelines.tasks.exome as exome -import cgatpipelines.tasks.exomeancestry as exomeancestry ############################################################################### ############################################################################### @@ -719,16 +718,19 @@ def annotateVariantsClinvar(infile, outfile): @transform(annotateVariantsClinvar, - regex(r"variants/all_samples_exac.snpsift_2.vcf.gz"), - r"variants/all_samples_gwasc.snpsift_3.vcf.gz") + regex(r"variants/all_samples.snpsift_2.vcf.gz"), + r"variants/all_samples.snpsift_3.vcf.gz") def annotateVariantsGWASC(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" job_threads = PARAMS["annotation_threads"] - gwas_catalog = PARAMS["annotation_gwas_catalog"] + outfile = P.snip(outfile,".gz") + statement = """SnpSift gwasCat -db %(gwas_catalog)s - %(infile)s > %(outfile)s;""" + %(infile)s > %(outfile)s 2> %(outfile)s.log; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz;""" P.run(statement) @@ -743,10 +745,13 @@ def annotateVariantsPhastcons(infile, outfile): PARAMS['genome_dir'], PARAMS['genome']) phastcons = PARAMS["annotation_phastcons"] - intout = 
outfile.replace("samples", "samples_phastc") + outfile = P.snip(outfile,".gz") + statement = """ln -sf %(genome_index)s %(phastcons)s/genome.fai && - SnpSift phastCons %(phastcons)s %(infile)s > - %(outfile)s;""" + SnpSift phastCons %(phastcons)s %(infile)s > + %(outfile)s 2> %(outfile)s.log; + bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz;""" P.run(statement) @@ -757,6 +762,7 @@ def annotateVariants1000G(infile, outfile): '''Add annotations using SNPsift''' job_memory = "6G" job_threads = PARAMS["annotation_threads"] + outfile = P.snip(outfile,".gz") vcfs = [] for f in os.listdir(PARAMS["annotation_tgdir"]): @@ -776,7 +782,10 @@ def annotateVariants1000G(infile, outfile): P.run(statement) shutil.move(tempin, outfile) - + statement = ''' bgzip %(outfile)s; + tabix -p vcf %(outfile)s.gz'''; + P.run(statement) + @transform(annotateVariants1000G, regex(r"variants/all_samples.snpsift_5.vcf.gz"), @@ -795,7 +804,7 @@ def annotateVariantsSNPsift(): @transform(annotateVariantsDBSNP, - regex(r"variants/all_samples_dbsnp.snpsift.vcf"), + regex(r"variants/all_samples.snpsift_final.vcf.gz"), r"variants/all_samples.vep.vcf") def annotateVariantsVEP(infile, outfile): ''' @@ -935,13 +944,13 @@ def damageFilterVariants(infiles, outfiles): @follows(mkdir("variant_tables_family")) @transform(damageFilterVariants, regex("variant_tables_damaging/(.*).tsv"), - add_inputs(calculateFamily), [r'variant_tables_family/\1.tsv', r'variant_tables_family/\1_failed.tsv']) def familyFilterVariants(infiles, outfiles): ''' Filter variants according to the output of calculateFamily - only variants shared by both members of a family will be kept. + BROKEN - NEED TO ADD FAMILY INPUTS ''' if len(matches) > 1: infile = infiles[0][0] @@ -1650,8 +1659,7 @@ def loadVCFstats(infiles, outfile): @follows(loadVariantAnnotation, - finalVariantTables, - ancestry) + finalVariantTables) def testFromVariantRecal(): pass @@ -1740,7 +1748,6 @@ def vcfstats(): callVariants, annotation, filtering, - ancestry, makeAnnotationsTables) def full(): pass From 5f081d9f4e889c06f0e58d2a1766378e2f4d26c4 Mon Sep 17 00:00:00 2001 From: jscaber Date: Thu, 12 May 2022 15:55:35 +0100 Subject: [PATCH 27/43] snpsift bugfixes --- cgatpipelines/tasks/exome.py | 3 +- cgatpipelines/tools/pipeline_exome.py | 47 ++++++++++++------- .../tools/pipeline_exome/pipeline.yml | 2 +- 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index 0afbefc89..a594af598 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -340,7 +340,8 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem): def snpSift(infile, outfile, vcf, bgzip=True, memory="6G"): '''Annotate VCF with other VCF''' job_memory=memory - statement = '''SnpSift annotate %(vcf)s + statement = '''SnpSift annotate + -Xmx%(job_memory)s %(vcf)s %(infile)s > %(outfile)s 2> %(outfile)s.log;''' if bgzip: statement += '''bgzip %(outfile)s; diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index fbdb046de..7fb8ade59 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -196,8 +196,8 @@ "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz") PICARD_MEMORY = PARAMS["picard_memory"] - GATK_MEMORY = PARAMS["gatk_memory"] +ANNOTATION_MEMORY = PARAMS["annotation_memory"] ############################################################################### @@ -648,7 +648,7 @@ def ancestrySomalier(infile,outfile): 
r"variants/all_samples.snpeff.vcf.gz") def annotateVariantsSNPeff(infile, outfile): '''Annotate variants using SNPeff''' - job_memory = PARAMS["annotation_memory"] + job_memory = ANNOTATION_MEMORY job_threads = PARAMS["annotation_threads"] snpeff_genome = PARAMS["annotation_snpeff_genome"] config = PARAMS["annotation_snpeff_config"] @@ -669,7 +669,7 @@ def vcfToTableSnpEff(infile, outfile): '''Converts vcf to tab-delimited file''' genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" columns = PARAMS["annotation_snpeff_to_table"] - exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) + exome.vcfToTable(infile, outfile, genome, columns, ANNOTATION_MEMORY) @jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") @@ -687,7 +687,7 @@ def loadTableSnpEff(infile, outfile): r"variants/all_samples.snpsift_1.vcf.gz") def annotateVariantsDBNSFP(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" + job_memory = ANNOTATION_MEMORY job_threads = PARAMS["annotation_threads"] dbNSFP = PARAMS["annotation_dbnsfp"] dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"] @@ -697,7 +697,8 @@ def annotateVariantsDBNSFP(infile, outfile): else: annostring = "-f %s" % dbNSFP_columns - statement = '''SnpSift dbnsfp + statement = '''SnpSift dbnsfp + -Xmx%(job_memory)s -db %(dbNSFP)s -v %(infile)s %(annostring)s > %(outfile)s 2> %(outfile)s.log; @@ -714,7 +715,7 @@ def annotateVariantsClinvar(infile, outfile): job_threads = PARAMS["annotation_threads"] clinvar = PARAMS["annotation_clinvar"] outfile = P.snip(outfile,".gz") - exome.snpSift(infile,outfile,clinvar) + exome.snpSift(infile,outfile,clinvar, memory=ANNOTATION_MEMORY) @transform(annotateVariantsClinvar, @@ -722,12 +723,14 @@ def annotateVariantsClinvar(infile, outfile): r"variants/all_samples.snpsift_3.vcf.gz") def annotateVariantsGWASC(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" + job_memory = ANNOTATION_MEMORY job_threads = PARAMS["annotation_threads"] gwas_catalog = PARAMS["annotation_gwas_catalog"] outfile = P.snip(outfile,".gz") - statement = """SnpSift gwasCat -db %(gwas_catalog)s + statement = """SnpSift gwasCat + -Xmx%(job_memory)s + -db %(gwas_catalog)s %(infile)s > %(outfile)s 2> %(outfile)s.log; bgzip %(outfile)s; tabix -p vcf %(outfile)s.gz;""" @@ -739,7 +742,7 @@ def annotateVariantsGWASC(infile, outfile): r"variants/all_samples.snpsift_4.vcf.gz") def annotateVariantsPhastcons(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" + job_memory = ANNOTATION_MEMORY job_threads = PARAMS["annotation_threads"] genome_index = "%s/%s.fa.fai" % ( PARAMS['genome_dir'], @@ -748,7 +751,9 @@ def annotateVariantsPhastcons(infile, outfile): outfile = P.snip(outfile,".gz") statement = """ln -sf %(genome_index)s %(phastcons)s/genome.fai && - SnpSift phastCons %(phastcons)s %(infile)s > + SnpSift phastCons + -Xmx%(job_memory)s + %(phastcons)s %(infile)s > %(outfile)s 2> %(outfile)s.log; bgzip %(outfile)s; tabix -p vcf %(outfile)s.gz;""" @@ -760,7 +765,7 @@ def annotateVariantsPhastcons(infile, outfile): r"variants/all_samples.snpsift_5.vcf.gz") def annotateVariants1000G(infile, outfile): '''Add annotations using SNPsift''' - job_memory = "6G" + job_memory = ANNOTATION_MEMORY job_threads = PARAMS["annotation_threads"] outfile = P.snip(outfile,".gz") @@ -769,16 +774,22 @@ def annotateVariants1000G(infile, outfile): if f.endswith(".vcf.gz"): vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) - T = P.get_temp_filename(".") - shutil.copy(infile, T) - tempin = T + tempin = 
P.get_temp_filename(".") + statement='''cp %(infile)s %(tempin)s.gz && + tabix -p vcf %(tempin)s.gz''' + P.run(statement) + tempout = P.get_temp_filename(".") for vcf in vcfs: statement = """SnpSift annotate - %(vcf)s - %(tempin)s > %(tempout)s && - mv %(tempout)s %(tempin)s""" + -Xmx%(job_memory)s + %(vcf)s + %(tempin)s.gz > %(tempout)s + 2>> %(outfile)s.log && + mv %(tempout)s %(tempin)s && + bgzip -f %(tempin)s && + tabix -f %(tempin)s.gz""" P.run(statement) shutil.move(tempin, outfile) @@ -794,7 +805,7 @@ def annotateVariantsDBSNP(infile, outfile): '''Add annotations using SNPsift''' job_threads = PARAMS["annotation_threads"] dbsnp = PARAMS["annotation_dbsnp"] - exome.snpSift(infile,outfile,dbsnp) + exome.snpSift(infile,outfile,dbsnp,memory=ANNOTATION_MEMORY) P.run(statement) diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 03d967266..0d267c94a 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -110,7 +110,7 @@ annotation: memory: 6G - threads: 3 + threads: 1 #Config file specifies that the genome must be downloaded into your home directory snpeff_config: path/to/snpEff-VERSION/snpEff.config From a589335d99db911425423458354a28e196eca756 Mon Sep 17 00:00:00 2001 From: jscaber Date: Fri, 13 May 2022 12:51:06 +0100 Subject: [PATCH 28/43] Add SpliceAI --- cgatpipelines/tools/pipeline_exome.py | 62 ++++++++++++++++++++------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 7fb8ade59..8b1c91aec 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -792,21 +792,19 @@ def annotateVariants1000G(infile, outfile): tabix -f %(tempin)s.gz""" P.run(statement) - shutil.move(tempin, outfile) - statement = ''' bgzip %(outfile)s; + statement = ''' mv %(tempin)s.gz %(outfile)s && tabix -p vcf %(outfile)s.gz'''; P.run(statement) @transform(annotateVariants1000G, regex(r"variants/all_samples.snpsift_5.vcf.gz"), - r"variants/all_samples_dbsnp.snpsift_final.vcf.gz") + r"variants/all_samples.snpsift_final.vcf.gz") def annotateVariantsDBSNP(infile, outfile): '''Add annotations using SNPsift''' job_threads = PARAMS["annotation_threads"] dbsnp = PARAMS["annotation_dbsnp"] exome.snpSift(infile,outfile,dbsnp,memory=ANNOTATION_MEMORY) - P.run(statement) @follows(annotateVariantsDBSNP) @@ -833,21 +831,55 @@ def annotateVariantsVEP(infile, outfile): vep_cache = PARAMS["annotation_vepcache"] vep_species = PARAMS["annotation_vepspecies"] vep_assembly = PARAMS["annotation_vepassembly"] - if len(vep_annotators) != 0: - annostring = vep_annotators - statement = '''perl %(vep_path)s/variant_effect_predictor.pl - --cache --dir %(vep_cache)s --vcf - --species %(vep_species)s - --fork 2 - --assembly %(vep_assembly)s --input_file %(infile)s - --output_file %(outfile)s --force_overwrite - %(annostring)s --offline;''' - P.run(statement) + statement = '''vep + -i %(infile)s + --cache --dir %(vep_cache)s --vcf + --species %(vep_species)s + --fork 2 + --compress_output bgzip + --assembly %(vep_assembly)s + -o %(outfile)s --force_overwrite + %(annostring)s;''' + P.run(statement) + + + + +@transform(annotateVariantsVEP, + regex(r"variants/all_samples.vep.vcf.gz"), + r"variants/all_samples.SpliceAI.vcf.gz") +def annotateVariantsSpliceAI(infile, outfile): + ''' + Adds annotations as specified in the pipeline.yml using Ensembl + variant effect predictor (VEP). 
+ ''' + # infile - VCF + # outfile - VCF with vep annotations + job_memory = "6G" + job_threads = 4 + + vep_annotators = PARAMS["annotation_vepannotators"] + vep_path = PARAMS["annotation_veppath"] + vep_cache = PARAMS["annotation_vepcache"] + vep_species = PARAMS["annotation_vepspecies"] + vep_assembly = PARAMS["annotation_vepassembly"] + statement = '''vep + -i %(infile)s + --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s + --cache --dir %(vep_cache)s --vcf + --species %(vep_species)s + --fork 2 + --compress_output bgzip + --assembly %(vep_assembly)s + -o %(outfile)s --force_overwrite + %(annostring)s;''' + P.run(statement) + @follows(mkdir("variant_tables")) @transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"), - add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv") + add_inputs(annotateVariantsSpliceAI), r"variant_tables/\1.tsv") def makeAnnotationsTables(infiles, outfile): ''' Converts the multi sample vcf generated with Haplotype caller into From 39e993e58d63691bd82ff33eb7c7ae125402df59 Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 16 May 2022 10:27:38 +0100 Subject: [PATCH 29/43] further exome pipeline - removed pedigree analysis - now also runs VEP on vcf - annotation pipeline completed --- cgatpipelines/tools/pipeline_exome.py | 583 +----------------- .../tools/pipeline_exome/pipeline.yml | 9 +- .../tools/pipeline_exome_pedigree.py | 491 +++++++++++++++ 3 files changed, 508 insertions(+), 575 deletions(-) create mode 100644 cgatpipelines/tools/pipeline_exome_pedigree.py diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 8b1c91aec..27164b985 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -792,7 +792,7 @@ def annotateVariants1000G(infile, outfile): tabix -f %(tempin)s.gz""" P.run(statement) - statement = ''' mv %(tempin)s.gz %(outfile)s && + statement = ''' mv %(tempin)s.gz %(outfile)s.gz && tabix -p vcf %(outfile)s.gz'''; P.run(statement) @@ -804,6 +804,7 @@ def annotateVariantsDBSNP(infile, outfile): '''Add annotations using SNPsift''' job_threads = PARAMS["annotation_threads"] dbsnp = PARAMS["annotation_dbsnp"] + outfile = P.snip(outfile,".gz") exome.snpSift(infile,outfile,dbsnp,memory=ANNOTATION_MEMORY) @@ -814,72 +815,43 @@ def annotateVariantsSNPsift(): @transform(annotateVariantsDBSNP, regex(r"variants/all_samples.snpsift_final.vcf.gz"), - r"variants/all_samples.vep.vcf") + r"variants/all_samples.vep.vcf.gz") def annotateVariantsVEP(infile, outfile): - ''' - Adds annotations as specified in the pipeline.yml using Ensembl - variant effect predictor (VEP). - ''' + + #Adds annotations as specified in the pipeline.yml using Ensembl + #variant effect predictor (VEP). 
+ # infile - VCF # outfile - VCF with vep annotations job_memory = "6G" job_threads = 4 - VEP = PARAMS["annotation_vepannotators"].split(",") + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" vep_annotators = PARAMS["annotation_vepannotators"] - vep_path = PARAMS["annotation_veppath"] vep_cache = PARAMS["annotation_vepcache"] vep_species = PARAMS["annotation_vepspecies"] vep_assembly = PARAMS["annotation_vepassembly"] + spliceAI_snv = PARAMS["annotation_spliceAI_snv"] + spliceAI_indel = PARAMS["annotation_spliceAI_indel"] statement = '''vep -i %(infile)s --cache --dir %(vep_cache)s --vcf - --species %(vep_species)s - --fork 2 - --compress_output bgzip - --assembly %(vep_assembly)s - -o %(outfile)s --force_overwrite - %(annostring)s;''' - P.run(statement) - - - - -@transform(annotateVariantsVEP, - regex(r"variants/all_samples.vep.vcf.gz"), - r"variants/all_samples.SpliceAI.vcf.gz") -def annotateVariantsSpliceAI(infile, outfile): - ''' - Adds annotations as specified in the pipeline.yml using Ensembl - variant effect predictor (VEP). - ''' - # infile - VCF - # outfile - VCF with vep annotations - job_memory = "6G" - job_threads = 4 - - vep_annotators = PARAMS["annotation_vepannotators"] - vep_path = PARAMS["annotation_veppath"] - vep_cache = PARAMS["annotation_vepcache"] - vep_species = PARAMS["annotation_vepspecies"] - vep_assembly = PARAMS["annotation_vepassembly"] - statement = '''vep - -i %(infile)s --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s - --cache --dir %(vep_cache)s --vcf --species %(vep_species)s --fork 2 --compress_output bgzip --assembly %(vep_assembly)s - -o %(outfile)s --force_overwrite - %(annostring)s;''' + -o %(outfile)s --force_overwrite''' + if vep_annotators is not None: + statement += " %(vep_annotators)s" + statement += ''' > %(outfile)s.log 2>&1; + tabix -p vcf %(outfile)s''' P.run(statement) - @follows(mkdir("variant_tables")) @transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"), - add_inputs(annotateVariantsSpliceAI), r"variant_tables/\1.tsv") + add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv") def makeAnnotationsTables(infiles, outfile): ''' Converts the multi sample vcf generated with Haplotype caller into @@ -1127,491 +1099,6 @@ def loadVariantAnnotation(infile, outfile): '''Load VCF annotations into database''' P.load(infile, outfile, options="--retry --ignore-empty") -############################################################################## -############################################################################### -############################################################################### -# Confirm parentage (do novo trios only) - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.parentage") -def confirmParentage(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - trio = P.snip(os.path.basename(pedfile), ".ped") - trio = trio.replace(".", "_").replace("-", "_") - database = PARAMS["database_name"] - proband = None - mother = None - father = None - for row in pedigree: - if row['status'] == '2': - proband = row['sample'].replace(".", "_").replace("-", "_") - mother = row['mother'].replace(".", "_").replace("-", "_") - father = row['father'].replace(".", "_").replace("-", "_") - 
E.info("proband:" + proband) - E.info("mother:" + mother) - E.info("father:" + father) - - query = '''SELECT '%(trio)s' as family, - hom_nonref_trio/(hom_nonref_parents+0.0) as hom_nonref_conc, - child_mat_het_nonref/(maternal_hom_nonref+0.0) as maternal_conc, - child_pat_het_nonref/(paternal_hom_nonref+0.0) as paternal_conc, - * - FROM - (SELECT count(*) as hom_nonref_trio - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as hom_nonref_parents - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as maternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_mat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as paternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_pat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10)''' % locals() - statement = '''sqlite3 %(database)s "%(query)s" > %(outfile)s - 2> %(outfile)s.log''' % locals() - P.run(statement) - -############################################################################### -############################################################################### -############################################################################### -# De novos - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.filtered.vcf") -def deNovoVariants(infiles, outfile): - '''Filter de novo variants based on provided jexl expression''' - job_memory = GATK_MEMORY - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = 
'''vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3&&((vc.getGenotype("%(child)s").getAD().1)/(vc.getGenotype("%(child)s").getDP().floatValue()))>=0.25&&(vc.getGenotype("%(father)s").getAD().1==0||(vc.getGenotype("%(father)s").getAD().1>0&&((vc.getGenotype("%(father)s").getAD().1)/(vc.getGenotype("%(father)s").getDP().floatValue()))<0.05))&&(vc.getGenotype("%(mother)s").getAD().1==0||(vc.getGenotype("%(mother)s").getAD().1>0&&((vc.getGenotype("%(mother)s").getAD().1)/(vc.getGenotype("%(mother)s").getDP().floatValue()))<0.05))&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - - -@transform(deNovoVariants, - regex(r"variants/(\S+).filtered.vcf"), - r"variants/\1.filtered.table") -def tabulateDeNovos(infile, outfile): - '''Tabulate de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDeNovos, - regex(r"variants/(\S+).filtered.table"), - r"variants/\1.filtered.table.load") -def loadDeNovos(infile, outfile): - '''Load de novos into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.denovos.vcf") -def lowerStringencyDeNovos(infiles, outfile): - '''Filter lower stringency de novo variants based on provided jexl - expression''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = '''vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - - -@transform(lowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.vcf"), - r"variants/\1.denovos.table") -def tabulateLowerStringencyDeNovos(infile, outfile): - '''Tabulate lower stringency de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateLowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.table"), - r"variants/\1.denovos.table.load") -def loadLowerStringencyDeNovos(infile, outfile): - '''Load lower stringency de novos into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - 
-############################################################################### -############################################################################### -############################################################################### -# Dominant - - -@follows(loadVariantAnnotation) -@transform("*Multiplex*.ped", - regex(r"(\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.dominant.vcf") -def dominantVariants(infiles, outfile): - '''Filter variants according to autosomal dominant disease model''' - pedfile, infile = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['status'] == '1': - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds) - affecteds_exp2 = '").getAD().1>=1&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \ - '").isHomRef()' - # for some weird reason the 1000G filter doesn't work on these particular - # files - will add later when I've figured out what's wrong - # currently 1000G filter is performed at the report stage (not in csvdb) - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().1==0&&vc.getGenotype("%(affecteds_exp2)s").getAD().1>=1%(unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(dominantVariants, - regex(r"variants/(\S+).dominant.vcf"), - r"variants/\1.dominant.table") -def tabulateDoms(infile, outfile): - '''Tabulate dominant disease candidate variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDoms, regex(r"variants/(\S+).dominant.table"), - r"variants/\1.dominant.table.load") -def loadDoms(infile, outfile): - '''Load dominant disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Recessive - - -@follows(loadVariantAnnotation) -@transform("*.ped", - regex( - r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.recessive.vcf") -def recessiveVariants(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - parents = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['father'] != '0': - parents += 
[row['father']] - if row['mother'] != '0': - parents += [row['mother']] - elif row['status'] == '1' and row['sample'] not in parents: - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)) + \ - '").getPL().2!=0' - if len(parents) == 0: - parents_exp = '' - else: - parents_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \ - '").getPL().1==0' - # need a way of specifying that other unaffecteds eg. sibs can't be - # homozygous for alt allele - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().2==0%(unaffecteds_exp)s%(parents_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(recessiveVariants, - regex(r"variants/(\S+).recessive.vcf"), - r"variants/\1.recessive.table") -def tabulateRecs(infile, outfile): - '''Tabulate potential homozygous recessive disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateRecs, regex(r"variants/(\S+).recessive.table"), - r"variants/\1.recessive.table.load") -def loadRecs(infile, outfile): - '''Load homozygous recessive disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# X-linked - - -@follows(loadVariantAnnotation) -@transform("*.ped", regex(r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.xlinked.vcf") -def xlinkedVariants(infiles, outfile): - '''Find maternally inherited X chromosome variants in male patients''' - track = P.snip(os.path.basename(outfile), ".vcf") - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - mothers = [] - male_unaffecteds = [] - female_unaffecteds = [] - for row in pedigree: - if row['status'] == '2' and row['mother'] != '0': - if row['mother'] not in mothers: - mothers += [row['mother']] - affecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] == '1': - male_unaffecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] != '1' and row['sample'] not in mothers: - female_unaffecteds += [row['sample']] - if len(affecteds) == 0: - affecteds_exp = '' - else: - affecteds_exp = '").isHomRef()&&!vc.getGenotype("'.join(affecteds) -# affecteds_exp = '&&!vc.getGenotype("' + \ -# ('").isHomRef()&&!vc.getGenotype("'.join(affecteds)) + \ -# '").isHomRef()' - if len(mothers) == 0: - mothers_exp = '' - else: - mothers_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(mothers)) + \ - '").getPL().1==0' - if len(male_unaffecteds) == 0: - 
male_unaffecteds_exp = '' - else: - male_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join( - male_unaffecteds)) + \ - '").isHomRef()' - if len(female_unaffecteds) == 0: - female_unaffecteds_exp = '' - else: - female_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join( - female_unaffecteds)) + \ - '").getPL().2!=0' - select = '''CHROM=="X"&&!vc.getGenotype("%(affecteds_exp)s").isHomRef()%(mothers_exp)s%(male_unaffecteds_exp)s%(female_unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(xlinkedVariants, - regex(r"variants/(\S+).xlinked.vcf"), - r"variants/\1.xlinked.table") -def tabulateXs(infile, outfile): - '''Tabulate potential X-linked disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateXs, regex(r"variants/(\S+).xlinked.table"), - r"variants/\1.xlinked.table.load") -def loadXs(infile, outfile): - '''Load X-linked disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Compound hets - - -# why does this not work from snpsift VCF file? 
DS -# @transform(annotateVariantsSNPsift, -# regex(r"variants/(\S*Trio\S+|\S*Multiplex\S+).haplotypeCaller.snpsift.vcf"), -@transform(annotateVariantsSNPeff, - regex( - r"variants/all_samples.snpeff.vcf"), - add_inputs(r"all_samples.ped", r"gatk/all_samples.list"), - r"variants/all_samples.phased.vcf") -def phasing(infiles, outfile): - '''phase variants with GATK''' - infile, pedfile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - statement = '''GenomeAnalysisTK -T PhaseByTransmission - -R %(genome)s - -V %(infile)s - -ped %(pedfile)s - -mvf %(infile)s.mvf - -o %(outfile)s ;''' % locals() - P.run(statement) - -############################################################################### - - -@transform(phasing, regex(r"variants/all_samples.phased.vcf"), - add_inputs(r"gatk/all_samples.list"), - r"variants/all_samples.rbp.vcf") -def readbackedphasing(infiles, outfile): - '''phase variants with ReadBackedPhasing''' - job_memory = "32G" - infile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4 - -R %(genome)s - -I %(bamlist)s - -V %(infile)s - -o %(outfile)s; ''' % locals() - P.run(statement) - -############################################################################### - - -@follows(readbackedphasing) -@transform("*.ped", - regex(r"(\S*Multiplex\S+|\S*Trio\S+).ped"), - add_inputs(r"no_multiallelic_all_samples.rbp.vcf", - r"all_samples.ped"), - r"variants/\1.compound_hets.table") -def compoundHets(infiles, outfile): - '''Identify potentially pathogenic compound heterozygous variants - (phasing with GATK followed by compound het calling using Gemini''' - family, infile, pedfile = infiles - family_id = P.snip(os.path.basename(family), ".ped") - statement = '''gemini load -v %(infile)s - -p %(pedfile)s -t snpEff %(family_id)s.db && - gemini comp_hets - --families %(family_id)s - --columns "chrom, start, end, ref, alt, codon_change, gene, qual, depth" - --filter - "(impact_severity = 'HIGH' OR impact_severity = 'MED') - AND (in_esp = 0 OR aaf_esp_all < 0.01) - AND (in_1kg = 0 OR aaf_1kg_all < 0.01)" - %(family_id)s.db > %(outfile)s;''' - # rm -f %(family_id)s.db''' - P.run(statement) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(compoundHets, regex(r"variants/(\S+).compound_hets.table"), - r"variants/\1.compound_hets.table.load") -def loadCompoundHets(infile, outfile): - '''Load compound heterozygous variants into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - ############################################################################### ############################################################################### ############################################################################### @@ -1742,43 +1229,6 @@ def annotation(): pass -@follows(loadDeNovos) -def denovo(): - pass - - -@follows(loadLowerStringencyDeNovos) -def denovo2(): - pass - - -@follows(loadDoms) -def dominant(): - pass - - -@follows(loadRecs) -def recessive(): - pass - - -@follows(loadXs) -def xlinked(): - pass - - -@follows(loadCompoundHets) -def compoundHet(): - pass - - -@follows(denovo, - dominant, - recessive, - compoundHet) -def filtering(): - pass - @follows(buildVCFstats, loadVCFstats) @@ -1790,7 +1240,6 @@ def vcfstats(): gatk, callVariants, annotation, - filtering, makeAnnotationsTables) def full(): pass diff --git 
a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 0d267c94a..0f7893bee 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -180,7 +180,7 @@ annotation: vepspecies: homo_sapiens # assembly - needs to correspond to data in cache directory - vepassembly: GRCh37 + vepassembly: GRCh38 # filtering of vcfs to select variants of interest filtering: @@ -227,13 +227,6 @@ filtering: # columns to keep in the final variant tables (apart from CHROM,POS,ID,REF1,ALT,GT) columns: [GQ,DP,FS,SOR,POPMAX_calc,Adj_calc,EUR_AF_calc,EAS_AF_calc,AFR_AF_calc,SAS_AF_calc,ESP_AF_POPMAX_calc,ESP_AF_GLOBAL_calc,CSQ,dbNSFP_MutationTaster_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,SNPEFF_IMPACT] -king: - #path to king - path: /path/to/king - - #path to plink - plink: /path/to/plink-1.07/plink-1.07-x86_64 - coverage: # to activate calculating coverage of candidate genes, specified below. diff --git a/cgatpipelines/tools/pipeline_exome_pedigree.py b/cgatpipelines/tools/pipeline_exome_pedigree.py new file mode 100644 index 000000000..0752af118 --- /dev/null +++ b/cgatpipelines/tools/pipeline_exome_pedigree.py @@ -0,0 +1,491 @@ + +''' +PEDIGREE CODE FROM PIPELINE EXOME +REMOVED FOR CLARITY +''' + + +############################################################################## +############################################################################### +############################################################################### +# Confirm parentage (do novo trios only) + + +@follows(loadVariantAnnotation) +@transform("*Trio*.ped", + regex(r"(\S*Trio\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.parentage") +def confirmParentage(infiles, outfile): + '''Filter variants according to autosomal recessive disease model''' + pedfile, infile = infiles + pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=[ + 'family', 'sample', 'father', 'mother', 'sex', 'status']) + trio = P.snip(os.path.basename(pedfile), ".ped") + trio = trio.replace(".", "_").replace("-", "_") + database = PARAMS["database_name"] + proband = None + mother = None + father = None + for row in pedigree: + if row['status'] == '2': + proband = row['sample'].replace(".", "_").replace("-", "_") + mother = row['mother'].replace(".", "_").replace("-", "_") + father = row['father'].replace(".", "_").replace("-", "_") + E.info("proband:" + proband) + E.info("mother:" + mother) + E.info("father:" + father) + + query = '''SELECT '%(trio)s' as family, + hom_nonref_trio/(hom_nonref_parents+0.0) as hom_nonref_conc, + child_mat_het_nonref/(maternal_hom_nonref+0.0) as maternal_conc, + child_pat_het_nonref/(paternal_hom_nonref+0.0) as paternal_conc, + * + FROM + (SELECT count(*) as hom_nonref_trio + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 'Y') + AND %(mother)s_PL LIKE '%%%%,0' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND %(father)s_PL LIKE '%%%%,0' + AND cast(%(father)s_DP as INTEGER) > 10 + AND %(proband)s_PL LIKE '%%%%,0' + AND cast(%(proband)s_DP as INTEGER) > 10), + (SELECT count(*) as hom_nonref_parents + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 'Y') + AND %(mother)s_PL LIKE '%%%%,0' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND %(father)s_PL LIKE '%%%%,0' + AND cast(%(father)s_DP as INTEGER) > 10 + AND cast(%(proband)s_DP as INTEGER) > 10), + (SELECT count(*) as maternal_hom_nonref + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 
'Y') + AND %(mother)s_PL LIKE '%%%%,0' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND %(father)s_PL LIKE '0,%%%%' + AND cast(%(father)s_DP as INTEGER) > 10 + AND cast(%(proband)s_DP as INTEGER) > 10), + (SELECT count(*) as child_mat_het_nonref + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 'Y') + AND %(mother)s_PL LIKE '%%%%,0' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND %(father)s_PL LIKE '0,%%%%' + AND cast(%(father)s_DP as INTEGER) > 10 + AND %(proband)s_PL LIKE '%%%%,0,%%%%' + AND cast(%(proband)s_DP as INTEGER) > 10), + (SELECT count(*) as paternal_hom_nonref + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 'Y') + AND %(father)s_PL LIKE '%%%%,0' + AND cast(%(father)s_DP as INTEGER) > 10 + AND %(mother)s_PL LIKE '0,%%%%' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND cast(%(proband)s_DP as INTEGER) > 10), + (SELECT count(*) as child_pat_het_nonref + FROM %(trio)s_genes_table + where CHROM NOT IN ('X', 'Y') + AND %(father)s_PL LIKE '%%%%,0' + AND cast(%(father)s_DP as INTEGER) > 10 + AND %(mother)s_PL LIKE '0,%%%%' + AND cast(%(mother)s_DP as INTEGER) > 10 + AND %(proband)s_PL LIKE '%%%%,0,%%%%' + AND cast(%(proband)s_DP as INTEGER) > 10)''' % locals() + statement = '''sqlite3 %(database)s "%(query)s" > %(outfile)s + 2> %(outfile)s.log''' % locals() + P.run(statement) + +############################################################################### +############################################################################### +############################################################################### +# De novos + + +@follows(loadVariantAnnotation) +@transform("*Trio*.ped", + regex(r"(\S*Trio\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.filtered.vcf") +def deNovoVariants(infiles, outfile): + '''Filter de novo variants based on provided jexl expression''' + job_memory = GATK_MEMORY + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + pedfile, infile = infiles + pedigree = csv.DictReader( + iotools.open_file(pedfile), delimiter='\t', fieldnames=[ + 'family', 'sample', 'father', 'mother', 'sex', 'status']) + for row in pedigree: + if row['status'] == '2': + father = row['father'] + mother = row['mother'] + child = row['sample'] + select = '''vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3&&((vc.getGenotype("%(child)s").getAD().1)/(vc.getGenotype("%(child)s").getDP().floatValue()))>=0.25&&(vc.getGenotype("%(father)s").getAD().1==0||(vc.getGenotype("%(father)s").getAD().1>0&&((vc.getGenotype("%(father)s").getAD().1)/(vc.getGenotype("%(father)s").getDP().floatValue()))<0.05))&&(vc.getGenotype("%(mother)s").getAD().1==0||(vc.getGenotype("%(mother)s").getAD().1>0&&((vc.getGenotype("%(mother)s").getAD().1)/(vc.getGenotype("%(mother)s").getDP().floatValue()))<0.05))&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() + exome.selectVariants(infile, outfile, genome, select) + + +@transform(deNovoVariants, + regex(r"variants/(\S+).filtered.vcf"), + r"variants/\1.filtered.table") +def tabulateDeNovos(infile, outfile): + '''Tabulate de novo variants''' + genome = 
PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + columns = PARAMS["gatk_vcf_to_table"] + exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") +@transform(tabulateDeNovos, + regex(r"variants/(\S+).filtered.table"), + r"variants/\1.filtered.table.load") +def loadDeNovos(infile, outfile): + '''Load de novos into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") + +############################################################################### + + +@follows(loadVariantAnnotation) +@transform("*Trio*.ped", + regex(r"(\S*Trio\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.denovos.vcf") +def lowerStringencyDeNovos(infiles, outfile): + '''Filter lower stringency de novo variants based on provided jexl + expression''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + pedfile, infile = infiles + pedigree = csv.DictReader( + iotools.open_file(pedfile), delimiter='\t', fieldnames=[ + 'family', 'sample', 'father', 'mother', 'sex', 'status']) + for row in pedigree: + if row['status'] == '2': + father = row['father'] + mother = row['mother'] + child = row['sample'] + select = '''vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( + ) + exome.selectVariants(infile, outfile, genome, select) + + +@transform(lowerStringencyDeNovos, + regex(r"variants/(\S+).denovos.vcf"), + r"variants/\1.denovos.table") +def tabulateLowerStringencyDeNovos(infile, outfile): + '''Tabulate lower stringency de novo variants''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + columns = PARAMS["gatk_vcf_to_table"] + exome.vcfToTable(infile, outfile, genome, columns) + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") +@transform(tabulateLowerStringencyDeNovos, + regex(r"variants/(\S+).denovos.table"), + r"variants/\1.denovos.table.load") +def loadLowerStringencyDeNovos(infile, outfile): + '''Load lower stringency de novos into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") + +############################################################################### +############################################################################### +############################################################################### +# Dominant + + +@follows(loadVariantAnnotation) +@transform("*Multiplex*.ped", + regex(r"(\S*Multiplex\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.dominant.vcf") +def dominantVariants(infiles, outfile): + '''Filter variants according to autosomal dominant disease model''' + pedfile, infile = infiles + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + pedigree = csv.DictReader(open(pedfile), delimiter='\t', + fieldnames=['family', 'sample', 'father', + 'mother', 'sex', 'status']) + affecteds = [] + unaffecteds = [] + for row in pedigree: + if row['status'] == '2': + affecteds += [row['sample']] + if row['status'] == '1': + unaffecteds += [row['sample']] + affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds) + affecteds_exp2 = '").getAD().1>=1&&vc.getGenotype("'.join(affecteds) + if len(unaffecteds) == 0: + unaffecteds_exp = '' + else: + unaffecteds_exp = '&&vc.getGenotype("' + \ + ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \ + '").isHomRef()' + # for some weird reason the 1000G 
filter doesn't work on these particular + # files - will add later when I've figured out what's wrong + # currently 1000G filter is performed at the report stage (not in csvdb) + select = '''vc.getGenotype("%(affecteds_exp)s").getPL().1==0&&vc.getGenotype("%(affecteds_exp2)s").getAD().1>=1%(unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() + exome.selectVariants(infile, outfile, genome, select) + +############################################################################### + + +@transform(dominantVariants, + regex(r"variants/(\S+).dominant.vcf"), + r"variants/\1.dominant.table") +def tabulateDoms(infile, outfile): + '''Tabulate dominant disease candidate variants''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + columns = PARAMS["gatk_vcf_to_table"] + exome.vcfToTable(infile, outfile, genome, columns) + +############################################################################### + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") +@transform(tabulateDoms, regex(r"variants/(\S+).dominant.table"), + r"variants/\1.dominant.table.load") +def loadDoms(infile, outfile): + '''Load dominant disease candidates into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") + +############################################################################### +############################################################################### +############################################################################### +# Recessive + + +@follows(loadVariantAnnotation) +@transform("*.ped", + regex( + r"(\S*Trio\S+|\S*Multiplex\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.recessive.vcf") +def recessiveVariants(infiles, outfile): + '''Filter variants according to autosomal recessive disease model''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + pedfile, infile = infiles + pedigree = csv.DictReader(open(pedfile), delimiter='\t', + fieldnames=['family', 'sample', 'father', + 'mother', 'sex', 'status']) + affecteds = [] + parents = [] + unaffecteds = [] + for row in pedigree: + if row['status'] == '2': + affecteds += [row['sample']] + if row['father'] != '0': + parents += [row['father']] + if row['mother'] != '0': + parents += [row['mother']] + elif row['status'] == '1' and row['sample'] not in parents: + unaffecteds += [row['sample']] + affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds) + if len(unaffecteds) == 0: + unaffecteds_exp = '' + else: + unaffecteds_exp = '&&vc.getGenotype("' + \ + ('").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)) + \ + '").getPL().2!=0' + if len(parents) == 0: + parents_exp = '' + else: + parents_exp = '&&vc.getGenotype("' + \ + ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \ + '").getPL().1==0' + # need a way of specifying that other unaffecteds eg. 
sibs can't be + # homozygous for alt allele + select = '''vc.getGenotype("%(affecteds_exp)s").getPL().2==0%(unaffecteds_exp)s%(parents_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( + ) + exome.selectVariants(infile, outfile, genome, select) + +############################################################################### + + +@transform(recessiveVariants, + regex(r"variants/(\S+).recessive.vcf"), + r"variants/\1.recessive.table") +def tabulateRecs(infile, outfile): + '''Tabulate potential homozygous recessive disease variants''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + columns = PARAMS["gatk_vcf_to_table"] + exome.vcfToTable(infile, outfile, genome, columns) + +############################################################################### + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") +@transform(tabulateRecs, regex(r"variants/(\S+).recessive.table"), + r"variants/\1.recessive.table.load") +def loadRecs(infile, outfile): + '''Load homozygous recessive disease candidates into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") + +############################################################################### +############################################################################### +############################################################################### +# X-linked + + +@follows(loadVariantAnnotation) +@transform("*.ped", regex(r"(\S*Trio\S+|\S*Multiplex\S+).ped"), + add_inputs(r"variants/all_samples.snpsift.vcf"), + r"variants/\1.xlinked.vcf") +def xlinkedVariants(infiles, outfile): + '''Find maternally inherited X chromosome variants in male patients''' + track = P.snip(os.path.basename(outfile), ".vcf") + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + pedfile, infile = infiles + pedigree = csv.DictReader(open(pedfile), delimiter='\t', + fieldnames=['family', 'sample', 'father', + 'mother', 'sex', 'status']) + affecteds = [] + mothers = [] + male_unaffecteds = [] + female_unaffecteds = [] + for row in pedigree: + if row['status'] == '2' and row['mother'] != '0': + if row['mother'] not in mothers: + mothers += [row['mother']] + affecteds += [row['sample']] + elif row['status'] == '1' and row['sex'] == '1': + male_unaffecteds += [row['sample']] + elif row['status'] == '1' and row['sex'] != '1' and row['sample'] not in mothers: + female_unaffecteds += [row['sample']] + if len(affecteds) == 0: + affecteds_exp = '' + else: + affecteds_exp = '").isHomRef()&&!vc.getGenotype("'.join(affecteds) +# affecteds_exp = '&&!vc.getGenotype("' + \ +# ('").isHomRef()&&!vc.getGenotype("'.join(affecteds)) + \ +# '").isHomRef()' + if len(mothers) == 0: + mothers_exp = '' + else: + mothers_exp = '&&vc.getGenotype("' + \ + ('").getPL().1==0&&vc.getGenotype("'.join(mothers)) + \ + '").getPL().1==0' + if len(male_unaffecteds) == 0: + male_unaffecteds_exp = '' + else: + male_unaffecteds_exp = '&&vc.getGenotype("' + \ + ('").isHomRef()&&vc.getGenotype("'.join( + male_unaffecteds)) + \ + '").isHomRef()' + if len(female_unaffecteds) == 0: + female_unaffecteds_exp = '' + else: + female_unaffecteds_exp = '&&vc.getGenotype("' + \ + ('").getPL().2!=0&&vc.getGenotype("'.join( + female_unaffecteds)) + \ + '").getPL().2!=0' + select = '''CHROM=="X"&&!vc.getGenotype("%(affecteds_exp)s").isHomRef()%(mothers_exp)s%(male_unaffecteds_exp)s%(female_unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() + exome.selectVariants(infile, outfile, genome, select) + 
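(Each model-based filter in this file rescans the PED file with the same csv.DictReader idiom; a self-contained sketch of that core pattern follows. parse_ped is illustrative, not a pipeline function: status '2' marks affected samples, '1' unaffected, and a father or mother of '0' means unknown.)

import csv

def parse_ped(pedfile):
    '''Split the samples in a PED file into affected and unaffected lists.'''
    fields = ['family', 'sample', 'father', 'mother', 'sex', 'status']
    affected, unaffected = [], []
    with open(pedfile) as handle:
        for row in csv.DictReader(handle, delimiter='\t', fieldnames=fields):
            if row['status'] == '2':
                affected.append(row['sample'])
            elif row['status'] == '1':
                unaffected.append(row['sample'])
    return affected, unaffected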
+############################################################################### + + +@transform(xlinkedVariants, + regex(r"variants/(\S+).xlinked.vcf"), + r"variants/\1.xlinked.table") +def tabulateXs(infile, outfile): + '''Tabulate potential X-linked disease variants''' + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + columns = PARAMS["gatk_vcf_to_table"] + exome.vcfToTable(infile, outfile, genome, columns) + +############################################################################### + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") +@transform(tabulateXs, regex(r"variants/(\S+).xlinked.table"), + r"variants/\1.xlinked.table.load") +def loadXs(infile, outfile): + '''Load X-linked disease candidates into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") + +############################################################################### +############################################################################### +############################################################################### +# Compound hets + + +# why does this not work from snpsift VCF file? DS +# @transform(annotateVariantsSNPsift, +# regex(r"variants/(\S*Trio\S+|\S*Multiplex\S+).haplotypeCaller.snpsift.vcf"), +@transform(annotateVariantsSNPeff, + regex( + r"variants/all_samples.snpeff.vcf"), + add_inputs(r"all_samples.ped", r"gatk/all_samples.list"), + r"variants/all_samples.phased.vcf") +def phasing(infiles, outfile): + '''phase variants with GATK''' + infile, pedfile, bamlist = infiles + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + statement = '''GenomeAnalysisTK -T PhaseByTransmission + -R %(genome)s + -V %(infile)s + -ped %(pedfile)s + -mvf %(infile)s.mvf + -o %(outfile)s ;''' % locals() + P.run(statement) + +############################################################################### + + +@transform(phasing, regex(r"variants/all_samples.phased.vcf"), + add_inputs(r"gatk/all_samples.list"), + r"variants/all_samples.rbp.vcf") +def readbackedphasing(infiles, outfile): + '''phase variants with ReadBackedPhasing''' + job_memory = "32G" + infile, bamlist = infiles + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4 + -R %(genome)s + -I %(bamlist)s + -V %(infile)s + -o %(outfile)s; ''' % locals() + P.run(statement) + +############################################################################### + + +@follows(readbackedphasing) +@transform("*.ped", + regex(r"(\S*Multiplex\S+|\S*Trio\S+).ped"), + add_inputs(r"no_multiallelic_all_samples.rbp.vcf", + r"all_samples.ped"), + r"variants/\1.compound_hets.table") +def compoundHets(infiles, outfile): + '''Identify potentially pathogenic compound heterozygous variants + (phasing with GATK followed by compound het calling using Gemini''' + family, infile, pedfile = infiles + family_id = P.snip(os.path.basename(family), ".ped") + statement = '''gemini load -v %(infile)s + -p %(pedfile)s -t snpEff %(family_id)s.db && + gemini comp_hets + --families %(family_id)s + --columns "chrom, start, end, ref, alt, codon_change, gene, qual, depth" + --filter + "(impact_severity = 'HIGH' OR impact_severity = 'MED') + AND (in_esp = 0 OR aaf_esp_all < 0.01) + AND (in_1kg = 0 OR aaf_1kg_all < 0.01)" + %(family_id)s.db > %(outfile)s;''' + # rm -f %(family_id)s.db''' + P.run(statement) + +############################################################################### + + +@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") 
+@transform(compoundHets, regex(r"variants/(\S+).compound_hets.table"), + r"variants/\1.compound_hets.table.load") +def loadCompoundHets(infile, outfile): + '''Load compound heterozygous variants into database''' + P.load(infile, outfile, + options="--retry --ignore-empty --allow-empty-file") From 0769f35e4cf3d4342630dad158e4a185f0f55fec Mon Sep 17 00:00:00 2001 From: jscaber Date: Mon, 16 May 2022 11:14:12 +0100 Subject: [PATCH 30/43] remove pedigree pipeline --- .../tools/pipeline_exome_pedigree.py | 491 ------------------ 1 file changed, 491 deletions(-) delete mode 100644 cgatpipelines/tools/pipeline_exome_pedigree.py diff --git a/cgatpipelines/tools/pipeline_exome_pedigree.py b/cgatpipelines/tools/pipeline_exome_pedigree.py deleted file mode 100644 index 0752af118..000000000 --- a/cgatpipelines/tools/pipeline_exome_pedigree.py +++ /dev/null @@ -1,491 +0,0 @@ - -''' -PEDIGREE CODE FROM PIPELINE EXOME -REMOVED FOR CLARITY -''' - - -############################################################################## -############################################################################### -############################################################################### -# Confirm parentage (do novo trios only) - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.parentage") -def confirmParentage(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - trio = P.snip(os.path.basename(pedfile), ".ped") - trio = trio.replace(".", "_").replace("-", "_") - database = PARAMS["database_name"] - proband = None - mother = None - father = None - for row in pedigree: - if row['status'] == '2': - proband = row['sample'].replace(".", "_").replace("-", "_") - mother = row['mother'].replace(".", "_").replace("-", "_") - father = row['father'].replace(".", "_").replace("-", "_") - E.info("proband:" + proband) - E.info("mother:" + mother) - E.info("father:" + father) - - query = '''SELECT '%(trio)s' as family, - hom_nonref_trio/(hom_nonref_parents+0.0) as hom_nonref_conc, - child_mat_het_nonref/(maternal_hom_nonref+0.0) as maternal_conc, - child_pat_het_nonref/(paternal_hom_nonref+0.0) as paternal_conc, - * - FROM - (SELECT count(*) as hom_nonref_trio - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as hom_nonref_parents - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as maternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_mat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(mother)s_PL LIKE '%%%%,0' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND 
%(father)s_PL LIKE '0,%%%%' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as paternal_hom_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND cast(%(proband)s_DP as INTEGER) > 10), - (SELECT count(*) as child_pat_het_nonref - FROM %(trio)s_genes_table - where CHROM NOT IN ('X', 'Y') - AND %(father)s_PL LIKE '%%%%,0' - AND cast(%(father)s_DP as INTEGER) > 10 - AND %(mother)s_PL LIKE '0,%%%%' - AND cast(%(mother)s_DP as INTEGER) > 10 - AND %(proband)s_PL LIKE '%%%%,0,%%%%' - AND cast(%(proband)s_DP as INTEGER) > 10)''' % locals() - statement = '''sqlite3 %(database)s "%(query)s" > %(outfile)s - 2> %(outfile)s.log''' % locals() - P.run(statement) - -############################################################################### -############################################################################### -############################################################################### -# De novos - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.filtered.vcf") -def deNovoVariants(infiles, outfile): - '''Filter de novo variants based on provided jexl expression''' - job_memory = GATK_MEMORY - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = '''vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3&&((vc.getGenotype("%(child)s").getAD().1)/(vc.getGenotype("%(child)s").getDP().floatValue()))>=0.25&&(vc.getGenotype("%(father)s").getAD().1==0||(vc.getGenotype("%(father)s").getAD().1>0&&((vc.getGenotype("%(father)s").getAD().1)/(vc.getGenotype("%(father)s").getDP().floatValue()))<0.05))&&(vc.getGenotype("%(mother)s").getAD().1==0||(vc.getGenotype("%(mother)s").getAD().1>0&&((vc.getGenotype("%(mother)s").getAD().1)/(vc.getGenotype("%(mother)s").getDP().floatValue()))<0.05))&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - - -@transform(deNovoVariants, - regex(r"variants/(\S+).filtered.vcf"), - r"variants/\1.filtered.table") -def tabulateDeNovos(infile, outfile): - '''Tabulate de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns, GATK_MEMORY) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDeNovos, - regex(r"variants/(\S+).filtered.table"), - r"variants/\1.filtered.table.load") -def loadDeNovos(infile, outfile): - '''Load de novos into 
database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### - - -@follows(loadVariantAnnotation) -@transform("*Trio*.ped", - regex(r"(\S*Trio\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.denovos.vcf") -def lowerStringencyDeNovos(infiles, outfile): - '''Filter lower stringency de novo variants based on provided jexl - expression''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader( - iotools.open_file(pedfile), delimiter='\t', fieldnames=[ - 'family', 'sample', 'father', 'mother', 'sex', 'status']) - for row in pedigree: - if row['status'] == '2': - father = row['father'] - mother = row['mother'] - child = row['sample'] - select = '''vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - - -@transform(lowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.vcf"), - r"variants/\1.denovos.table") -def tabulateLowerStringencyDeNovos(infile, outfile): - '''Tabulate lower stringency de novo variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateLowerStringencyDeNovos, - regex(r"variants/(\S+).denovos.table"), - r"variants/\1.denovos.table.load") -def loadLowerStringencyDeNovos(infile, outfile): - '''Load lower stringency de novos into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Dominant - - -@follows(loadVariantAnnotation) -@transform("*Multiplex*.ped", - regex(r"(\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.dominant.vcf") -def dominantVariants(infiles, outfile): - '''Filter variants according to autosomal dominant disease model''' - pedfile, infile = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['status'] == '1': - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().1==0&&vc.getGenotype("'.join(affecteds) - affecteds_exp2 = '").getAD().1>=1&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join(unaffecteds)) + \ - '").isHomRef()' - # for some weird reason the 1000G filter doesn't work on these particular - # files - will add later when I've figured out what's wrong - # currently 1000G filter is performed at the report stage (not in csvdb) - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().1==0&&vc.getGenotype("%(affecteds_exp2)s").getAD().1>=1%(unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - 
exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(dominantVariants, - regex(r"variants/(\S+).dominant.vcf"), - r"variants/\1.dominant.table") -def tabulateDoms(infile, outfile): - '''Tabulate dominant disease candidate variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateDoms, regex(r"variants/(\S+).dominant.table"), - r"variants/\1.dominant.table.load") -def loadDoms(infile, outfile): - '''Load dominant disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Recessive - - -@follows(loadVariantAnnotation) -@transform("*.ped", - regex( - r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.recessive.vcf") -def recessiveVariants(infiles, outfile): - '''Filter variants according to autosomal recessive disease model''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - parents = [] - unaffecteds = [] - for row in pedigree: - if row['status'] == '2': - affecteds += [row['sample']] - if row['father'] != '0': - parents += [row['father']] - if row['mother'] != '0': - parents += [row['mother']] - elif row['status'] == '1' and row['sample'] not in parents: - unaffecteds += [row['sample']] - affecteds_exp = '").getPL().2==0&&vc.getGenotype("'.join(affecteds) - if len(unaffecteds) == 0: - unaffecteds_exp = '' - else: - unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join(unaffecteds)) + \ - '").getPL().2!=0' - if len(parents) == 0: - parents_exp = '' - else: - parents_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(parents)) + \ - '").getPL().1==0' - # need a way of specifying that other unaffecteds eg. 
sibs can't be - # homozygous for alt allele - select = '''vc.getGenotype("%(affecteds_exp)s").getPL().2==0%(unaffecteds_exp)s%(parents_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals( - ) - exome.selectVariants(infile, outfile, genome, select) - -############################################################################### - - -@transform(recessiveVariants, - regex(r"variants/(\S+).recessive.vcf"), - r"variants/\1.recessive.table") -def tabulateRecs(infile, outfile): - '''Tabulate potential homozygous recessive disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateRecs, regex(r"variants/(\S+).recessive.table"), - r"variants/\1.recessive.table.load") -def loadRecs(infile, outfile): - '''Load homozygous recessive disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# X-linked - - -@follows(loadVariantAnnotation) -@transform("*.ped", regex(r"(\S*Trio\S+|\S*Multiplex\S+).ped"), - add_inputs(r"variants/all_samples.snpsift.vcf"), - r"variants/\1.xlinked.vcf") -def xlinkedVariants(infiles, outfile): - '''Find maternally inherited X chromosome variants in male patients''' - track = P.snip(os.path.basename(outfile), ".vcf") - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - pedfile, infile = infiles - pedigree = csv.DictReader(open(pedfile), delimiter='\t', - fieldnames=['family', 'sample', 'father', - 'mother', 'sex', 'status']) - affecteds = [] - mothers = [] - male_unaffecteds = [] - female_unaffecteds = [] - for row in pedigree: - if row['status'] == '2' and row['mother'] != '0': - if row['mother'] not in mothers: - mothers += [row['mother']] - affecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] == '1': - male_unaffecteds += [row['sample']] - elif row['status'] == '1' and row['sex'] != '1' and row['sample'] not in mothers: - female_unaffecteds += [row['sample']] - if len(affecteds) == 0: - affecteds_exp = '' - else: - affecteds_exp = '").isHomRef()&&!vc.getGenotype("'.join(affecteds) -# affecteds_exp = '&&!vc.getGenotype("' + \ -# ('").isHomRef()&&!vc.getGenotype("'.join(affecteds)) + \ -# '").isHomRef()' - if len(mothers) == 0: - mothers_exp = '' - else: - mothers_exp = '&&vc.getGenotype("' + \ - ('").getPL().1==0&&vc.getGenotype("'.join(mothers)) + \ - '").getPL().1==0' - if len(male_unaffecteds) == 0: - male_unaffecteds_exp = '' - else: - male_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").isHomRef()&&vc.getGenotype("'.join( - male_unaffecteds)) + \ - '").isHomRef()' - if len(female_unaffecteds) == 0: - female_unaffecteds_exp = '' - else: - female_unaffecteds_exp = '&&vc.getGenotype("' + \ - ('").getPL().2!=0&&vc.getGenotype("'.join( - female_unaffecteds)) + \ - '").getPL().2!=0' - select = '''CHROM=="X"&&!vc.getGenotype("%(affecteds_exp)s").isHomRef()%(mothers_exp)s%(male_unaffecteds_exp)s%(female_unaffecteds_exp)s&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")''' % locals() - exome.selectVariants(infile, outfile, genome, select) - 
-############################################################################### - - -@transform(xlinkedVariants, - regex(r"variants/(\S+).xlinked.vcf"), - r"variants/\1.xlinked.table") -def tabulateXs(infile, outfile): - '''Tabulate potential X-linked disease variants''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["gatk_vcf_to_table"] - exome.vcfToTable(infile, outfile, genome, columns) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(tabulateXs, regex(r"variants/(\S+).xlinked.table"), - r"variants/\1.xlinked.table.load") -def loadXs(infile, outfile): - '''Load X-linked disease candidates into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") - -############################################################################### -############################################################################### -############################################################################### -# Compound hets - - -# why does this not work from snpsift VCF file? DS -# @transform(annotateVariantsSNPsift, -# regex(r"variants/(\S*Trio\S+|\S*Multiplex\S+).haplotypeCaller.snpsift.vcf"), -@transform(annotateVariantsSNPeff, - regex( - r"variants/all_samples.snpeff.vcf"), - add_inputs(r"all_samples.ped", r"gatk/all_samples.list"), - r"variants/all_samples.phased.vcf") -def phasing(infiles, outfile): - '''phase variants with GATK''' - infile, pedfile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - statement = '''GenomeAnalysisTK -T PhaseByTransmission - -R %(genome)s - -V %(infile)s - -ped %(pedfile)s - -mvf %(infile)s.mvf - -o %(outfile)s ;''' % locals() - P.run(statement) - -############################################################################### - - -@transform(phasing, regex(r"variants/all_samples.phased.vcf"), - add_inputs(r"gatk/all_samples.list"), - r"variants/all_samples.rbp.vcf") -def readbackedphasing(infiles, outfile): - '''phase variants with ReadBackedPhasing''' - job_memory = "32G" - infile, bamlist = infiles - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - statement = '''GenomeAnalysisTK -T ReadBackedPhasing -nt 4 - -R %(genome)s - -I %(bamlist)s - -V %(infile)s - -o %(outfile)s; ''' % locals() - P.run(statement) - -############################################################################### - - -@follows(readbackedphasing) -@transform("*.ped", - regex(r"(\S*Multiplex\S+|\S*Trio\S+).ped"), - add_inputs(r"no_multiallelic_all_samples.rbp.vcf", - r"all_samples.ped"), - r"variants/\1.compound_hets.table") -def compoundHets(infiles, outfile): - '''Identify potentially pathogenic compound heterozygous variants - (phasing with GATK followed by compound het calling using Gemini''' - family, infile, pedfile = infiles - family_id = P.snip(os.path.basename(family), ".ped") - statement = '''gemini load -v %(infile)s - -p %(pedfile)s -t snpEff %(family_id)s.db && - gemini comp_hets - --families %(family_id)s - --columns "chrom, start, end, ref, alt, codon_change, gene, qual, depth" - --filter - "(impact_severity = 'HIGH' OR impact_severity = 'MED') - AND (in_esp = 0 OR aaf_esp_all < 0.01) - AND (in_1kg = 0 OR aaf_1kg_all < 0.01)" - %(family_id)s.db > %(outfile)s;''' - # rm -f %(family_id)s.db''' - P.run(statement) - -############################################################################### - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") 
-@transform(compoundHets, regex(r"variants/(\S+).compound_hets.table"), - r"variants/\1.compound_hets.table.load") -def loadCompoundHets(infile, outfile): - '''Load compound heterozygous variants into database''' - P.load(infile, outfile, - options="--retry --ignore-empty --allow-empty-file") From d6685f329ea17672acbc88fcc8dfc1b7562f9f63 Mon Sep 17 00:00:00 2001 From: jscaber Date: Fri, 20 May 2022 15:44:25 +0100 Subject: [PATCH 31/43] functions to establish gene 'Panel' BED from list further cleanup further rearrangement VEP output now txt --- cgatpipelines/tasks/exome.py | 311 +---------- cgatpipelines/tools/pipeline_exome.py | 508 +++++------------- .../tools/pipeline_exome/pipeline.yml | 170 +++--- 3 files changed, 207 insertions(+), 782 deletions(-) diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py index a594af598..edee873fc 100644 --- a/cgatpipelines/tasks/exome.py +++ b/cgatpipelines/tasks/exome.py @@ -76,7 +76,7 @@ def GATKReadGroups(infile, outfile, dictionary, def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, - solid_options="", gatkmem="2G"): + options="", gatkmem="2G"): '''Recalibrates base quality scores using GATK''' track = P.snip(os.path.basename(infile), ".bam") @@ -91,7 +91,7 @@ def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp, -L %(intervals)s -ip %(padding)s -I %(infile)s - --known-sites %(dbsnp)s %(solid_options)s ; + --known-sites %(dbsnp)s %(options)s ; ''' % locals() statement += '''gatk @@ -896,313 +896,6 @@ def filterQuality(infile, qualstr, qualfilter, outfiles): out2.close() -@cluster_runnable -def FilterExacCols(infile, exac_suffs, exac_thresh): - ''' - Returns a set of line indices indicating lines where either of the alleles - called have a frequency of greater that exac_thresh in any of the - populations specified as exac_suffs. - Where no data is available an allele frequency of -1 is used. - - Exac provide data as AC_xxx and AN_xxx where AC is the allele count - - the number of times the allele has been called - - and AN is chromosome count - the number of - samples in which the allele could have been called - in population xxx. - AC / AN = allele frequecy. - - exac_suffs are any columns where an AC_xxx and AN_xxx column is provided - in the VCF, e.g. Adj will calculate allele frequency from the AC_Adj - and AN_Adj columns - - ''' - # read columns from the input VCF - exac_suffs = exac_suffs.split(",") - cols = iotools.open_file(infile).readline().strip().split("\t") - nD = dict() - afdict = dict() - for e in exac_suffs: - # find the columns with the appropriate information - # Allele count - AC_i = cols.index("AC_%s" % (e)) - # Allele Number - AN_i = cols.index("AN_%s" % (e)) - # Genotype - GT_i = cols.index('GT') - nlist = set() - n = 0 - AFS = [] - with iotools.open_file(infile) as input: - for line in input: - if n > 1: - line = line.strip().split("\t") - # At multi-allelic sites, comma delimited AC and AN values - # are provided - # "." 
and "NA" indicate no data here - this is represented - # as an AF of -1 - AC = line[AC_i].replace(".", "-1").replace( - "NA", "-1").split(",") - AN = line[AN_i].replace(".", "1").replace( - "NA", "1").split(",") - AC = np.array([float(a) for a in AC]) - AN = np.array([float(a) for a in AN]) - AF = AC / AN - AF2 = [af if af > 0 else 0 for af in AF] - AF = np.insert(AF, 0, (1 - sum(AF2))) - - # Chromosome count is usually the same for all minor - # alleles (but not always) - # If it is not the same the AC and AN lists should be the - # same length - # Otherwise AN will have length 1 - if len(AC) != len(AN): - AN = [AN] * len(AC) - - # Record the genotype called in this sample for this SNP - GT = line[GT_i] - GT = GT.replace(".", '0') - GT = GT.split("/") - GT[0], GT[1] = int(GT[0]), int(GT[1]) - - # If the variant is not in ExAC the ExAC columns show "." - # but the site - # may still have been called as multi allelic - # - use -1 for all frequencies - # in this case - if max(GT) > (len(AF) - 1): - AF = np.array([-1] * (max(GT) + 1)) - - AF1 = AF[GT[0]] - AF2 = AF[GT[1]] - AFS.append((AF1, AF2)) - # Remember where both allele frequencies are - # greater than exac_thresh - if AF1 >= exac_thresh and AF2 >= exac_thresh: - nlist.add(n) - else: - AFS.append(('NA', 'NA')) - n += 1 - afdict[e] = AFS - nD[e] = nlist - - ns = set.union(*list(nD.values())) - return afdict, ns - - -@cluster_runnable -def FilterFreqCols(infile, thresh, fcols): - ''' - Returns a set of line indices indicating lines where either of the alleles - called have a frequency of less than thresh in all of the columns specified - in fcols. - No information - assigned allele frequency of -1. - ''' - fcols = fcols.split(",") - # read the column headings from the variant table - cols = iotools.open_file(infile).readline().strip().split("\t") - # store allele frequency columns - AFdict = dict() - # store low frequency indices - nD = dict() - for col in fcols: - ind = cols.index(col) - GT_i = cols.index('GT') - n = 0 - nlist = set() - AFS = [] - with iotools.open_file(infile) as input: - for line in input: - if n > 1: - line = line.strip().split("\t") - GT = line[GT_i].replace(".", "0").split("/") - af = line[ind].split(",") - AF = [] - # where the allele frequency is not numeric - # "." or "NA" use -1 to indicate no data - for a in af: - try: - AF.append(float(a)) - except: - AF.append(float(-1)) - AF2 = [l if l > 0 else 0 for l in AF] - AF = np.array(AF) - AF = np.insert(AF, 0, 1 - sum(AF2)) - GT[0] = int(GT[0]) - GT[1] = int(GT[1]) - # If the variant is not in database the column shows "." - # but the site - # may still have been called as multi allelic - # - use -1 for all frequencies - # in this case - if max(GT[0], GT[1]) > (len(AF) - 1): - AF = [float(-1)] * (max(GT[0], GT[1]) + 1) - AF1 = AF[GT[0]] - AF2 = AF[GT[1]] - if AF1 >= thresh and AF2 >= thresh: - nlist.add(n) - AFS.append((AF1, AF2)) - else: - AFS.append(('NA', 'NA')) - n += 1 - AFdict[col] = AFS - nD[col] = nlist - - ns = set.union(*list(nD.values())) - return AFdict, ns - - -def WriteFreqFiltered(infile, exacdict, exacinds, otherdict, otherinds, - outfiles): - ''' - Writes the output of the frequency filtering steps to file, including - the new columns showing calculated allele frequency for the allele - in this specific sample. 
- ''' - x = 0 - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - - exaccols = list(exacdict.keys()) - othercols = list(otherdict.keys()) - - # column names for the new columns - exacnewcols = ["%s_calc" % c for c in exaccols] - othernewcols = ["%s_calc" % c for c in othercols] - - with iotools.open_file(infile) as infile: - for line in infile: - line = line.strip() - if x <= 1: - # write the column names - out.write("%s\t%s\t%s\n" % (line, "\t".join(exacnewcols), - "\t".join(othernewcols))) - else: - freqs = [] - for key in exaccols: - col = exacdict[key] - freqs.append(col[x]) - for key in othercols: - col = otherdict[key] - freqs.append(col[x]) - freqs = [str(freq) for freq in freqs] - - if x not in exacinds and x not in otherinds: - out.write("%s\t%s\n" % (line, "\t".join(freqs))) - else: - out2.write("%s\t%s\n" % (line, "\t".join(freqs))) - x += 1 - out.close() - out2.close() - - -@cluster_runnable -def filterRarity(infile, exac, freqs, thresh, outfiles): - ''' - Filter out variants which are common in any of the exac or other - population datasets as specified in the pipeline.yml. - ''' - exacdict, exacinds = FilterExacCols(infile, exac, thresh) - otherdict, otherinds = FilterFreqCols(infile, thresh, freqs) - WriteFreqFiltered(infile, exacdict, exacinds, otherdict, - otherinds, outfiles) - - -@cluster_runnable -def filterDamage(infile, damagestr, outfiles): - ''' - Filter variants which have not been assessed as damaging by any - of the specified tools. - Tools and thresholds can be specified in the pipeline.yml. - - Does not account for multiple alt alleles - if any ALT allele has - been assessed as damaging with any tool the variant is kept, - regardless of if this is the allele called in the sample. - - ''' - damaging = damagestr.split(",") - cols = iotools.open_file(infile).readline().strip().split("\t") - - D = dict() - # parses the "damage string" from the pipeline.yml - # this should be formatted as COLUMN|result1-result2-...,COLUMN|result1... - # where variants with any of these results in this column will - # be retained - for d in damaging: - d = d.split("|") - col = d[0] - res = d[1].split("-") - i = cols.index(col) - D[col] = ((res, i)) - - x = 0 - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - with iotools.open_file(infile) as input: - for line in input: - if x > 1: - # grep for specific strings within this column of this - # line of the input file - line = line.strip().split("\t") - isdamaging = 0 - for key in D: - res, i = D[key] - current = line[i] - for r in res: - if re.search(r, current): - isdamaging = 1 - if isdamaging == 1: - out.write("%s\n" % "\t".join(line)) - else: - out2.write("%s\n" % "\t".join(line)) - else: - out.write(line) - x += 1 - out.close() - out2.close() - - -@cluster_runnable -def filterFamily(infile, infile2, outfiles): - ''' - Filter variants according to the output of calculateFamily - - only variants shared by both members of a family will be kept. 
- ''' - cps1 = set() - cps2 = set() - - # make a list of variants in infile1 - with iotools.open_file(infile) as input: - for line in input: - line = line.strip().split("\t") - chrom = line[0] - pos = line[1] - cp = "%s_%s" % (chrom, pos) - cps1.add(cp) - - # make a list of variants in infile2 - with iotools.open_file(infile2) as input: - for line in input: - line = line.strip().split("\t") - chrom = line[0] - pos = line[1] - cp = "%s_%s" % (chrom, pos) - cps2.add(cp) - - # only variants in both are of interest - cps = cps1 & cps2 - - out = iotools.open_file(outfiles[0], "w") - out2 = iotools.open_file(outfiles[1], "w") - with iotools.open_file(infile) as input: - for line in input: - line = line.strip().split("\t") - if "%s_%s" % (line[0], line[1]) in cps: - out.write("%s\n" % "\t".join(line)) - else: - out2.write("%s\n" % "\t".join(line)) - out.close() - out2.close() - - @cluster_runnable def CleanVariantTables(genes, variants, cols, outfile): variants = pd.read_csv(variants, sep="\t") diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index 27164b985..cb79d5137 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -163,7 +163,7 @@ # load modules from ruffus import transform, mkdir, follows, merge, regex, suffix, \ jobs_limit, files, collate, add_inputs, formatter, \ - active_if, originate + active_if, originate, subdivide from ruffus.combinatorics import permutations import sys import os @@ -199,6 +199,8 @@ GATK_MEMORY = PARAMS["gatk_memory"] ANNOTATION_MEMORY = PARAMS["annotation_memory"] +PANELS = glob.glob("*.panel.tsv") + ############################################################################### ############################################################################### @@ -259,7 +261,41 @@ def createSequenceDictionary(outfile): -R %(genome_file)s -O %(outfile)s''' P.run(statement) +@originate("ensembl.dict") +def createEnsemblDictionary(outfile): + '''creates a dictionary converting ensembl gene_id + to symbol/gene_name''' + geneset = PARAMS['geneset'] + + statement = '''zcat %(geneset)s| + cgat gtf2tsv --attributes-as-columns|grep -v ^#| + awk -F '\\t' '{print $9, $16}'|uniq > %(outfile)s''' + + P.run(statement) + +@mkdir("panels") +@subdivide([x for x in PANELS], + regex("(\S+).panel.tsv"), + add_inputs(createEnsemblDictionary), + r"\1.bed") +def createPanelBed(infiles, outfile): + + geneset = PARAMS['geneset'] + panel_path,dict_path = infiles + ensembl_dict = pd.read_csv(dict_path, sep=" ") + with open(panel_path) as f: + panellist = [line.strip() for line in f] + panel = ensembl_dict[ensembl_dict["gene_name"].isin(panellist)] + panelgenes = " -e ".join(panel['gene_id'].to_list()) + + statement = '''zcat %(geneset)s + |grep -e %(panelgenes)s|cgat gtf2gtf --method=sort| + cgat gff2bed --is-gtf --log %(outfile)s.log | sort -k1,1 -k2,2n| + cgat bed2bed --method=merge --merge-by-name + --log %(outfile)s.log > %(outfile)s''' + + P.run(statement) ############################################################################### ############################################################################### @@ -342,7 +378,7 @@ def GATKBaseRecal(infile, outfile): intrack = P.snip(os.path.basename(infile), ".bam") outtrack = P.snip(os.path.basename(outfile), ".bam") dbsnp = PARAMS["gatk_dbsnp"] - solid_options = PARAMS["gatk_solid_options"] + options = PARAMS["gatk_baserecalibrator_options"] genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" intervals = PARAMS["roi_intervals"] padding = 
PARAMS["roi_padding"] @@ -351,7 +387,7 @@ def GATKBaseRecal(infile, outfile): shutil.copyfile(intrack + ".bai", outtrack + ".bai") else: exome.GATKBaseRecal(infile, outfile, genome, intervals, - padding, dbsnp, solid_options, + padding, dbsnp, options, GATK_MEMORY) iotools.zap_file(infile) @@ -496,6 +532,9 @@ def filterExcessHets(infile, outfile): P.run(statement) +############################################################################### +# SNP Recalibration + @transform(filterExcessHets, regex(r"variants/all_samples_excesshet.vcf.gz"), r"variants/all_samples_sitesonly.vcf.gz") @@ -511,10 +550,6 @@ def sitesOnlyVcf(infile,outfile): P.run(statement) - -############################################################################### -# SNP Recalibration - @transform(sitesOnlyVcf, regex(r"variants/all_samples_sitesonly.vcf.gz"), r"variants/all_samples.snp_vqsr.recal") @@ -578,6 +613,44 @@ def applyVQSRIndels(infiles, outfile): exome.applyVQSR(vcf, recal, tranches, outfile, genome, mode, GATK_MEMORY) +############################################################################### +# coverage over candidate genes + +@merge(RemoveDuplicatesSample, "gatk/all_samples.list") +def listOfBAMs(infiles, outfile): + '''generates a file containing a list of BAMs''' + with iotools.open_file(outfile, "w") as outf: + for infile in infiles: + outf.write(infile + '\n') + +@active_if(PARAMS["coverage_calculate"] == 1) +@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary") +def candidateCoverage(infile, outfile): + '''Calculate coverage over exons of candidate genes''' + all_exons = PARAMS["coverage_all_exons"] + candidates = PARAMS["coverage_candidates"] + candidates = candidates.replace(",", " -e ") + genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" + threshold = PARAMS["coverage_threshold"] + statement = '''zcat %(all_exons)s | grep -e %(candidates)s + | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - > + candidate.interval_list ; ''' % locals() + statement += '''zcat %(all_exons)s | grep -e %(candidates)s + | awk '{print $16}' - | sed 's/"//g;s/;//g' - > + candidate_gene_names.txt ;''' % locals() + statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s + -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals() + P.run(statement) + +@active_if(PARAMS["coverage_calculate"] == 1) +@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf") +def candidateCoveragePlots(infile, outfile): + '''Produce plots of coverage''' + rscript = PARAMS["coverage_rscript"] + threshold = PARAMS["coverage_threshold"] + statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;''' + P.run(statement) + ############################################################################### @@ -613,7 +686,7 @@ def extractSomalier(infile,outfile): regex(r"variants/somalier/somalier.log"), r"variants/somalier/relate.log") def relateSomalier(infile,outfile): - '''Calculate Ancestry based on 1000G project''' + '''Calculate relatedness based on 1000G project''' somalier = PARAMS['annotation_somalier_path'] outdir = os.path.dirname(outfile) @@ -640,182 +713,13 @@ def ancestrySomalier(infile,outfile): P.run(statement) + ############################################################################### -# SnpSift +# VEP @transform(applyVQSRIndels, regex(r"variants/all_samples.vqsr.vcf.gz"), - r"variants/all_samples.snpeff.vcf.gz") -def 
annotateVariantsSNPeff(infile, outfile): - '''Annotate variants using SNPeff''' - job_memory = ANNOTATION_MEMORY - job_threads = PARAMS["annotation_threads"] - snpeff_genome = PARAMS["annotation_snpeff_genome"] - config = PARAMS["annotation_snpeff_config"] - outfile = P.snip(outfile,".gz") - statement = ''' snpEff -Xmx%(job_memory)s - -c %(config)s - -v %(snpeff_genome)s - %(infile)s > %(outfile)s 2> %(outfile)s.log; - bgzip %(outfile)s; - tabix -p vcf %(outfile)s.gz''' % locals() - P.run(statement, job_memory=PARAMS["annotation_memory"]) - - -@transform(annotateVariantsSNPeff, - regex(r"variants/all_samples.snpeff.vcf.gz"), - r"variants/all_samples.snpeff.table") -def vcfToTableSnpEff(infile, outfile): - '''Converts vcf to tab-delimited file''' - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - columns = PARAMS["annotation_snpeff_to_table"] - exome.vcfToTable(infile, outfile, genome, columns, ANNOTATION_MEMORY) - - -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@transform(vcfToTableSnpEff, regex(r"variants/(\S+).table"), - r"variants/\1.table.load") -def loadTableSnpEff(infile, outfile): - '''Load VCF annotations into database''' - P.load(infile, outfile, options="--retry --ignore-empty") - -############################################################################### -# SnpSift - -@transform(annotateVariantsSNPeff, - regex(r"variants/all_samples.snpeff.vcf.gz"), - r"variants/all_samples.snpsift_1.vcf.gz") -def annotateVariantsDBNSFP(infile, outfile): - '''Add annotations using SNPsift''' - job_memory = ANNOTATION_MEMORY - job_threads = PARAMS["annotation_threads"] - dbNSFP = PARAMS["annotation_dbnsfp"] - dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"] - outfile = P.snip(outfile,".gz") - if not dbNSFP_columns: - annostring = "" - else: - annostring = "-f %s" % dbNSFP_columns - - statement = '''SnpSift dbnsfp - -Xmx%(job_memory)s - -db %(dbNSFP)s -v %(infile)s - %(annostring)s > - %(outfile)s 2> %(outfile)s.log; - bgzip %(outfile)s; - tabix -p vcf %(outfile)s.gz''' - P.run(statement) - - -@transform(annotateVariantsDBNSFP, - regex(r"variants/all_samples.snpsift_1.vcf.gz"), - r"variants/all_samples.snpsift_2.vcf.gz") -def annotateVariantsClinvar(infile, outfile): - '''Add annotations using SNPsift''' - job_threads = PARAMS["annotation_threads"] - clinvar = PARAMS["annotation_clinvar"] - outfile = P.snip(outfile,".gz") - exome.snpSift(infile,outfile,clinvar, memory=ANNOTATION_MEMORY) - - -@transform(annotateVariantsClinvar, - regex(r"variants/all_samples.snpsift_2.vcf.gz"), - r"variants/all_samples.snpsift_3.vcf.gz") -def annotateVariantsGWASC(infile, outfile): - '''Add annotations using SNPsift''' - job_memory = ANNOTATION_MEMORY - job_threads = PARAMS["annotation_threads"] - gwas_catalog = PARAMS["annotation_gwas_catalog"] - outfile = P.snip(outfile,".gz") - - statement = """SnpSift gwasCat - -Xmx%(job_memory)s - -db %(gwas_catalog)s - %(infile)s > %(outfile)s 2> %(outfile)s.log; - bgzip %(outfile)s; - tabix -p vcf %(outfile)s.gz;""" - P.run(statement) - - -@transform(annotateVariantsGWASC, - regex(r"variants/all_samples.snpsift_3.vcf.gz"), - r"variants/all_samples.snpsift_4.vcf.gz") -def annotateVariantsPhastcons(infile, outfile): - '''Add annotations using SNPsift''' - job_memory = ANNOTATION_MEMORY - job_threads = PARAMS["annotation_threads"] - genome_index = "%s/%s.fa.fai" % ( - PARAMS['genome_dir'], - PARAMS['genome']) - phastcons = PARAMS["annotation_phastcons"] - outfile = P.snip(outfile,".gz") - - statement = """ln -sf %(genome_index)s 
%(phastcons)s/genome.fai && - SnpSift phastCons - -Xmx%(job_memory)s - %(phastcons)s %(infile)s > - %(outfile)s 2> %(outfile)s.log; - bgzip %(outfile)s; - tabix -p vcf %(outfile)s.gz;""" - P.run(statement) - - -@transform(annotateVariantsPhastcons, - regex(r"variants/all_samples.snpsift_4.vcf.gz"), - r"variants/all_samples.snpsift_5.vcf.gz") -def annotateVariants1000G(infile, outfile): - '''Add annotations using SNPsift''' - job_memory = ANNOTATION_MEMORY - job_threads = PARAMS["annotation_threads"] - outfile = P.snip(outfile,".gz") - - vcfs = [] - for f in os.listdir(PARAMS["annotation_tgdir"]): - if f.endswith(".vcf.gz"): - vcfs.append("%s/%s" % (PARAMS['annotation_tgdir'], f)) - - tempin = P.get_temp_filename(".") - statement='''cp %(infile)s %(tempin)s.gz && - tabix -p vcf %(tempin)s.gz''' - P.run(statement) - - tempout = P.get_temp_filename(".") - - for vcf in vcfs: - statement = """SnpSift annotate - -Xmx%(job_memory)s - %(vcf)s - %(tempin)s.gz > %(tempout)s - 2>> %(outfile)s.log && - mv %(tempout)s %(tempin)s && - bgzip -f %(tempin)s && - tabix -f %(tempin)s.gz""" - P.run(statement) - - statement = ''' mv %(tempin)s.gz %(outfile)s.gz && - tabix -p vcf %(outfile)s.gz'''; - P.run(statement) - - -@transform(annotateVariants1000G, - regex(r"variants/all_samples.snpsift_5.vcf.gz"), - r"variants/all_samples.snpsift_final.vcf.gz") -def annotateVariantsDBSNP(infile, outfile): - '''Add annotations using SNPsift''' - job_threads = PARAMS["annotation_threads"] - dbsnp = PARAMS["annotation_dbsnp"] - outfile = P.snip(outfile,".gz") - exome.snpSift(infile,outfile,dbsnp,memory=ANNOTATION_MEMORY) - - -@follows(annotateVariantsDBSNP) -def annotateVariantsSNPsift(): - pass - - -@transform(annotateVariantsDBSNP, - regex(r"variants/all_samples.snpsift_final.vcf.gz"), - r"variants/all_samples.vep.vcf.gz") + r"variants/all_samples.vep.txt.gz") def annotateVariantsVEP(infile, outfile): #Adds annotations as specified in the pipeline.yml using Ensembl @@ -823,38 +727,69 @@ def annotateVariantsVEP(infile, outfile): # infile - VCF # outfile - VCF with vep annotations - job_memory = "6G" - job_threads = 4 + job_memory = PARAMS["annotation_memory"] + job_threads = PARAMS["annotation_threads"] genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - vep_annotators = PARAMS["annotation_vepannotators"] + vep_options = PARAMS["annotation_vepoptions"] vep_cache = PARAMS["annotation_vepcache"] vep_species = PARAMS["annotation_vepspecies"] vep_assembly = PARAMS["annotation_vepassembly"] spliceAI_snv = PARAMS["annotation_spliceAI_snv"] spliceAI_indel = PARAMS["annotation_spliceAI_indel"] + dbNSFP_path = PARAMS["annotation_dbnsfp_path"] + dbNSFP_columns = PARAMS["annotation_dbnsfp_columns"] + if dbNSFP_columns is None: + dbNSFP_columns = "ALL" + statement = '''vep -i %(infile)s - --cache --dir %(vep_cache)s --vcf - --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s + --cache --dir %(vep_cache)s --tab --species %(vep_species)s - --fork 2 + --fork %(job_threads)s --compress_output bgzip --assembly %(vep_assembly)s -o %(outfile)s --force_overwrite''' - if vep_annotators is not None: - statement += " %(vep_annotators)s" + if dbNSFP_path is not None: + statement +=''' + --plugin dbNSFP,%(dbNSFP_path)s,%(dbNSFP_columns)s''' + if spliceAI_snv is not None: + statement +=''' + --plugin SpliceAI,snv=%(spliceAI_snv)s,indel=%(spliceAI_indel)s''' + if vep_options is not None: + statement += " %(vep_options)s" statement += ''' > %(outfile)s.log 2>&1; tabix -p vcf %(outfile)s''' P.run(statement) 
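
With --tab the task now writes a bgzipped tab-delimited file rather than a VCF. As a rough guide to what the conditional assembly produces, a hypothetical expansion of the statement (placeholder paths, dbNSFP columns defaulting to ALL, SpliceAI and extra options unset; the trailing tabix -p vcf call is omitted here because that preset assumes VCF column layout):

    # Hypothetical expansion only; real values are interpolated from pipeline.yml.
    statement = '''vep -i variants/all_samples.vqsr.vcf.gz
    --cache --dir /path/to/VEP --tab
    --species homo_sapiens
    --fork 4
    --compress_output bgzip
    --assembly GRCh38
    -o variants/all_samples.vep.txt.gz --force_overwrite
    --plugin dbNSFP,/path/to/dbNSFP.txt.gz,ALL
    > variants/all_samples.vep.txt.gz.log 2>&1'''
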
+@transform(annotateVariantsVEP, regex(r"variants/all_samples.vep.txt.gz"),
+           r'variants/all_samples.filtered.txt.gz')
+def filterVEP(infile, outfile):
+    '''
+    Filter VEP-annotated variants using filter_vep, keeping only
+    variants that match the filter expression supplied in the
+    pipeline.yml.
+    '''
+
+    filters = PARAMS["annotation_vepfilter"]
+
+    statement = '''filter_vep
+    --gz
+    -i %(infile)s
+    -o %(outfile)s
+    --only_matched
+    --force_overwrite
+    -f "%(filters)s"
+    '''
+    P.run(statement)
+
+
 @follows(mkdir("variant_tables"))
 @transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"),
-           add_inputs(annotateVariantsVEP), r"variant_tables/\1.tsv")
+           add_inputs(filterVEP), r"variant_tables/\1.tsv")
 def makeAnnotationsTables(infiles, outfile):
     '''
-    Converts the multi sample vcf generated with Haplotype caller into
+    Converts the multi sample vcf into
     a single table for each sample containing only positions called as
     variants in that sample and all columns from the vcf.
     '''
@@ -898,7 +833,7 @@ def makeAnnotationsTables(infiles, outfile):
         -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
         %(inputvcf)s >> %(outfile)s'''
     P.run(statement)
- 
+ 
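
The -f argument to filter_vep in the filterVEP task above takes a boolean expression over annotation fields. The key name annotation_vepfilter and the fields below are assumptions for illustration (MAX_AF, for instance, is only present if VEP ran with --max_af):

    # Hypothetical vepfilter value for pipeline.yml, shown as the Python
    # string that would be interpolated into the filter_vep statement.
    filters = ("(IMPACT is HIGH or IMPACT is MODERATE) "
               "and (MAX_AF < 0.01 or not MAX_AF)")
    # Roughly equivalent command line:
    #   filter_vep --gz -i variants/all_samples.vep.txt.gz
    #              -o variants/all_samples.filtered.txt.gz
    #              --only_matched --force_overwrite
    #              -f "(IMPACT is HIGH or IMPACT is MODERATE) and (MAX_AF < 0.01 or not MAX_AF)"
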
- BROKEN - NEED TO ADD FAMILY INPUTS - ''' - if len(matches) > 1: - infile = infiles[0][0] - - infilenam = infile.split("/")[-1] - infilestem = "/".join(infile.split("/")[:-1]) - - # figure out who is related to who - families = [line.strip().split("\t")[:2] - for line in iotools.open_file(infiles[1]).readlines()] - infam = [line[0] for line in families] + [line[1] for line in families] - - # no relatives - copy the input file to the output file and generate - # a blank "failed" file - if infilenam not in infam or PARAMS['filtering_family'] == 0: - shutil.copy(infile, outfiles[0]) - o = iotools.open_file(outfiles[1], "w") - o.close() - else: - for line in families: - if infilenam in line: - i = line.index(infilenam) - if i == 0: - infile2 = "%s/%s" % (infilestem, line[1]) - else: - infile2 = "%s/%s" % (infilestem, line[0]) - exome.filterFamily(infile, infile2, outfiles) + outfile1 = outfiles[0] + outfile2 = outfiles[1] + + if qualstring is not None: + exome.filterQuality(infile, qualstring, qualfilter, + outfiles, submit=True) else: - infile = infiles[0][0] - shutil.copy(infile, outfiles[0]) - out = iotools.open_file(outfiles[1], "w") - out.close() + statement = '''cp %(infile)s %(outfile1)s && + touch %(outfile2)s''' + P.run(statement) -@follows(familyFilterVariants) +@follows(filterVEP) def filter(): pass - -@follows(mkdir("genelists")) -@transform(familyFilterVariants, regex("variant_tables_family/(.*).tsv"), - [r'genelists/\1.tsv', - r'genelists/\1.bed']) -def makeGeneLists(infiles, outfiles): - infile = infiles[0] - outfile = outfiles[1] - genes = PARAMS['general_geneset'] - - # four %s because they need to be escaped in generating the statement - # then again when submitting the P.run(statement) - statement = '''awk 'NR > 2 {printf("%%%%s\\t%%%%s\\t%%%%s\\n",\ - $1, $2, $2 + 1)}'\ - %(infile)s |\ - bedtools intersect -wo -a stdin -b %(genes)s > %(outfile)s''' % locals() - P.run(statement) - - geneids = set() - with iotools.open_file(outfile) as inp: - for line in inp: - line = line.strip().split("\t") - details = line[11].split(";") - for detail in details: - r = re.search('gene_id', detail) - if r: - geneid = detail.split(" ")[-1] - geneids.add(geneid.replace("\"", "")) - out = iotools.open_file(outfiles[0], "w") - for geneid in geneids: - out.write("%s\n" % geneid) - out.close() - - -@follows(mkdir('final_variant_tables')) -@collate((makeGeneLists, familyFilterVariants), regex('(.*)/(.*).tsv'), - r'final_variant_tables/\2.tsv') -def finalVariantTables(infiles, outfile): - genes = infiles[0] - variants = infiles[1][0] - cols = PARAMS['filtering_columns'].split(",") - exome.CleanVariantTables(genes, variants, cols, outfile, - submit=True) - -############################################################################### -############################################################################### ############################################################################### # Genes of interest - @active_if(PARAMS["annotation_add_genes_of_interest"] == 1) @transform((annotateVariantsVEP), regex(r"variants/all_samples.snpsift.vcf"), @@ -1099,49 +915,6 @@ def loadVariantAnnotation(infile, outfile): '''Load VCF annotations into database''' P.load(infile, outfile, options="--retry --ignore-empty") -############################################################################### -############################################################################### -############################################################################### -# coverage over candidate genes - 
-@merge(RemoveDuplicatesSample, "gatk/all_samples.list") -def listOfBAMs(infiles, outfile): - '''generates a file containing a list of BAMs for use in VQSR''' - with iotools.open_file(outfile, "w") as outf: - for infile in infiles: - outf.write(infile + '\n') - -@active_if(PARAMS["coverage_calculate"] == 1) -@transform(listOfBAMs, regex(r"gatk/all_samples.list"), r"candidate.sample_interval_summary") -def candidateCoverage(infile, outfile): - '''Calculate coverage over exons of candidate genes''' - all_exons = PARAMS["coverage_all_exons"] - candidates = PARAMS["coverage_candidates"] - candidates = candidates.replace(",", " -e ") - genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa" - threshold = PARAMS["coverage_threshold"] - statement = '''zcat %(all_exons)s | grep -e %(candidates)s - | awk '{print $1 ":" $4 "-" $5}' - | sed 's/chr//' - > - candidate.interval_list ; ''' % locals() - statement += '''zcat %(all_exons)s | grep -e %(candidates)s - | awk '{print $16}' - | sed 's/"//g;s/;//g' - > - candidate_gene_names.txt ;''' % locals() - statement += '''GenomeAnalysisTK -T DepthOfCoverage -R %(genome)s - -o candidate -I %(infile)s -ct %(threshold)s -L candidate.interval_list ;''' % locals() - P.run(statement) - -############################################################################### - - -@active_if(PARAMS["coverage_calculate"] == 1) -@transform(candidateCoverage, regex(r"candidate.sample_interval_summary"), r"candidate_coverage_plot.pdf") -def candidateCoveragePlots(infile, outfile): - '''Produce plots of coverage''' - rscript = PARAMS["coverage_rscript"] - threshold = PARAMS["coverage_threshold"] - statement = '''Rscript %(rscript)s %(infile)s candidate_gene_names.txt %(threshold)s %(outfile)s ;''' - P.run(statement) - ############################################################################### ############################################################################### ############################################################################### @@ -1188,8 +961,7 @@ def loadVCFstats(infiles, outfile): # Targets -@follows(loadVariantAnnotation, - finalVariantTables) +@follows(loadVariantAnnotation) def testFromVariantRecal(): pass @@ -1221,10 +993,8 @@ def callVariants(): pass -@follows(loadTableSnpEff, - listOfBAMs, - loadVariantAnnotation, - finalVariantTables) +@follows(listOfBAMs, + loadVariantAnnotation) def annotation(): pass diff --git a/cgatpipelines/tools/pipeline_exome/pipeline.yml b/cgatpipelines/tools/pipeline_exome/pipeline.yml index 0f7893bee..ec86a4889 100644 --- a/cgatpipelines/tools/pipeline_exome/pipeline.yml +++ b/cgatpipelines/tools/pipeline_exome/pipeline.yml @@ -20,13 +20,9 @@ samples: '' web_dir: ../web # If you are not running exome data but targetted sequencing on a smaller region then change to 1 +# Effect of this option is to skip base quality recalibration step targetted: 0 -#pedfile=/path/to/pedfile - -# seed for random number generation -randomseed: 1000 - # gtf file containing exon annotations geneset: /path/to/ensembl.dir/geneset_all.gtf.gz @@ -48,7 +44,7 @@ bwa: mem_options: '' # BWA end-pairing parameters - sampe_options: '' + sample_options: '' readgroup: @@ -60,10 +56,6 @@ readgroup: sample: blood -dedup: - - method: picard - gatk: threads: 8 @@ -75,105 +67,65 @@ gatk: # 1.2x this memory will be requested from the cluster to provide sufficent memory for the database tool. 
  dbmem: 20G

+  # path to dbsnp VCF (provided by GATK)
   dbsnp: /path/to/dbsnp.vcf

+  # path to hapmap VCF (provided by GATK)
   hapmap: /path/to/hapmap.vcf

-  omni: /path/to/1000G_omni2.5.hg19.sites.vcf
+  # path to OMNI VCF (provided by GATK)
+  omni: /path/to/1000G_omni

-  kgenomes: /path/to/1000G_phase1.snps.high_confidence.hg19.sites.vcf
+  # path to 1000G VCF (provided by GATK)
+  kgenomes: /path/to/1000G_phase1.snps.high_confidence

-  mills: /path/to/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
+  mills: /path/to/Mills_and_1000G_gold_standard

-  axiom: /path/to/axioum
+  axiom: /path/to/axiom

-  solid_options: ''
+  # Additional options for GATK BaseRecalibrator
+  baserecalibrator_options: ''

+  # Additional options for GATK HaplotypeCaller
   hc_options: -A StrandOddsRatio -A DepthPerSampleHC

+  # Additional options for GATK GenotypeGVCFs
   genotype_options: ''

   vcf_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F QUAL -F FILTER -F AC -F AF -F AN -F BaseQRankSum -F DB -F DP -F Dels -F FS -F HaplotypeScore -F MLEAC -F MLEAF -F MQ -F MQ0 -F MQRankSum -F QD -F ReadPosRankSum -F SB -F SNPEFF_EFFECT -F SNPEFF_IMPACT -F SNPEFF_FUNCTIONAL_CLASS -F SNPEFF_CODON_CHANGE -F SNPEFF_AMINO_ACID_CHANGE -F SNPEFF_GENE_NAME -F SNPEFF_GENE_BIOTYPE -F SNPEFF_TRANSCRIPT_ID -F SNPEFF_EXON_ID -F dbNSFP_GERP++_RS -F dbNSFP_GERP++_NR -F dbNSFP_Ensembl_transcriptid -F dbNSFP_Uniprot_acc -F dbNSFP_Interpro_domain -F dbNSFP_SIFT_score -F dbNSFP_Polyphen2_HVAR_pred -F dbNSFP_29way_logOdds -F dbNSFP_1000Gp1_AF -F dbNSFP_1000Gp1_AFR_AF -F dbNSFP_1000Gp1_EUR_AF -F dbNSFP_1000Gp1_AMR_AF -F dbNSFP_1000Gp1_ASN_AF -F dbNSFP_ESP6500_AA_AF -F dbNSFP_ESP6500_EA_AF -F RSPOS -F SSR -F SAO -F VP -F VC -F PM -F TPA -F PMC -F MUT -F VLD -F OTHERKG -F PH3 -F CDA -F MTP -F OM -F CAF -F COMMON -GF GT -GF AD -GF GQ -GF PL -GF PQ -GF TP -GF AB -GF DP

-hapmap:
-
-  loc: /path/to/hapmap/genotype_frequencies
-
-  # bgzip compressede and tabix indexed VCF file for Hapmap
-  vcf: /path/to/hapmap_3.3.hg19.sites.vcf
-
-  padding: 0
-
-  hc_options: --emitRefConfidence GVCF

+# annotation is done using Ensembl VEP
 annotation:

   memory: 6G

-  threads: 1
-
-  #Config file specifies that the genome must be downloaded into your home directory
-  snpeff_config: path/to/snpEff-VERSION/snpEff.config
-
-  snpeff_genome: hg38
+  threads: 4

-  # path to dbNSFP database
-  dbnsfp: /path/to/dbNFSP.txt.gz
+  # path to dbNSFP database (aggregation database for annotations)
+  # leave blank if not required
+  dbnsfp_path: /path/to/dbNFSP.txt.gz

   # dbNSFP columns to include
-  # Default (if left empty): Uniprot_acc, Interpro_domain,
+  # If left empty, ALL annotations will be fetched.
+ # Examples: Uniprot_acc, Interpro_domain, # SIFT_pred, Polyphen2_HDIV_pred, Polyphen2_HVAR_pred, # LRT_pred, MutationTaster_pred, GERP++_NR, GERP++_RS, - # phastCons100way_vertebrate, 1000Gp1_AF, 1000Gp1_AFR_AF, - # 1000Gp1_EUR_AF, 1000Gp1_AMR_AF, 1000Gp1_ASN_AF, + # phastCons100way_vertebrate, 1000Gp1_AF, 1000Gp1_AFR_AF # ESP6500_AA_AF, ESP6500_EA_AF, MutationTaster_pred, # MutationAssessor_pred, FATHMM_pred, PROVEAN_pred, # CADD_phred, MetaSVM_pred, 1000Gp3_AC, 1000Gp3_AF, - # 1000Gp3_AFR_AC, 1000Gp3_AFR_AF, 1000Gp3_EUR_AC, - # 1000Gp3_EUR_AF, 1000Gp3_AMR_AC, 1000Gp3_AMR_AF, - # 1000Gp3_EAS_AC, 1000Gp3_EAS_AF, 1000Gp3_SAS_AC, - # 1000Gp3_SAS_AF, ESP6500_AA_AC, ESP6500_AA_AF, - # ESP6500_EA_AC, ESP6500_EA_AF, ExAC_AC, ExAC_AF, - # ExAC_Adj_AC, ExAC_Adj_AF, ExAC_AFR_AC, ExAC_AFR_AF, - # ExAC_AMR_AC, ExAC_AMR_AF, ExAC_EAS_AC, - # ExAC_EAS_AF, ExAC_FIN_AC, ExAC_FIN_AF, ExAC_NFE_AC, - # ExAC_NFE_AF, ExAC_SAS_AC, ExAC_SAS_AF - dbnsfp_columns: - - # path to clinvar vcf - clinvar: /path/to/clinvar_DATE.vcf.gz - - # path to dbsnp vcf - dbsnp: /path/to/All_20160527.vcf.gz - - # path to chromosome by chromosome 1000g vcfs - tgdir: /path/to/1000_genomes/phase3 - - # path to gwas catalog files - gwas_catalog: /path/to/gwas_catalog_v1.0-downloaded_2015-07-14.txt - - # path to "phastcons100way" files - phastcons: /path/to/phastcons - - snpeff_to_table: -F CHROM -F POS -F ID -F REF -F ALT -F EFF - - add_genes_of_interest: 0 - - genes_of_interest: '' - - # vep parameters - see vep docs here - # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html - - # options to pass to vep - this string is appended directly - # to the command when vep is called so should be formatted - # e.g. as - # --sift b --regulatory --domains - vepannotators: --sift b --variant_class --regulatory --domains + # 1000Gp3_AFR_AC, 1000Gp3_AFR_AF, 1000Gp3_EUR_AC, + # ESP6500_EA_AC, ESP6500_EA_AF, ExAC_AC, ExAC_AF, + # ExAC_Adj_AC, ExAC_Adj_AF, ExAC_AFR_AC, ExAC_AFR_AF + dbnsfp_columns: # path to VEP scripts veppath: /path/to/ensembl-vep-80/variant_effect_predictor # location of cached annotations + # cached annotations should have the same version number + # as the VEP executable vepcache: /path/to/VEP # species name @@ -182,9 +134,40 @@ annotation: # assembly - needs to correspond to data in cache directory vepassembly: GRCh38 - # filtering of vcfs to select variants of interest + # vep parameters - see vep docs here + # http://www.ensembl.org/info/docs/tools/vep/script/vep_options.html + # format as for command line e.g. 
as --sift b --regulatory --domains
+  # Option -e covers almost all annotations
+  vepoptions: -e
+
+  # Path to Somalier package (currently not available as conda package)
+  # Website: https://github.com/brentp/somalier
+  somalier_path: /path/to/somalier
+
+  # Path to Somalier VCF specifying sites used for determination of relatedness
+  # Provided by Somalier Project
+  somalier_sites: /path/to/sites.hg38.vcf.gz
+
+  # Path to 1000G data folder (provided by Somalier project)
+  somalier_1kg: /path/to/1kg-somalier
+
+  # Path to Somalier Ancestry Labels for 1000G (provided by Somalier project)
+  somalier_labels: /path/to/ancestry-labels-1kg.tsv
+
+  add_genes_of_interest: 0
+
+  genes_of_interest: ''
+
+
+# filtering of vcfs to select variants of interest
+# filtering is done in two stages: quality filtering (GATK) and
+# annotation-based filtering (VEP's filter_vep)
 filtering:

+  # Quality filtering can be done either by VQSR, a GATK machine-learning
+  # algorithm, or by hard-filtering QC scores based on prespecified conditions
+  quality: [VQSR OR hard]
+
+  #
   # quality filters, separated by commas, as column'symbol'score where
   # column is the column to filter
   # symbol is '>', '<', '>=', '<=' (including quote marks)
@@ -198,31 +181,10 @@ filtering:
   # any = keep variants fulfilling any of the quality criteria specified above
   quality_ft: all

-  rarethresh: 0.01
-
-  # columns containing comma delimiated allele frequencies to filter by
-  freqs: [EUR_AF,EAS_AF,AFR_AF,AMR_AF,SAS_AF,ESP_AF_POPMAX,ESP_AF_GLOBAL]
-
-  #populations to use when filtering Exac allele frequencies
-  #xxx for any columns where an AC_xxx and AN_xxx column is provided in the
-  #Exac VCF, e.g. Adj will calculate allele frequency from
-  # the AC_Adj and AN_Adj columns
-  # currently available Adj (all populations), POPMAX (highest af in any population),
-  # specific populations e.g. EUR, AFR and others
-  exac: [Adj,POPMAX]
-
-  # columns containing information about predicted damage from this mutation
-  # and strings to search for in these columns
-  # columns containing the strings are kept
-  # specified as column|string-string-...,column|string...
-  # e.g. SNPEFF_IMPACT|HIGH-MODERATE,CSQ|HIGH would look for
-  # "HIGH" or "MODERATE" in the SNPEFF_IMPACT column and
-  # "HIGH" in the CSQ column
-  damage: [CSQ|HIGH,dbNSFP_MutationTaster_pred|A-D,dbNSFP_Polyphen2_HDIV_pred|D-P,dbNSFP_Polyphen2_HVAR_pred|D-P,dbNSFP_SIFT_pred|D,SNPEFF_IMPACT|HIGH]
-
-  # if family is 1, keep only variants shared by any family members identified in
-  # the calculateFamily step
-  family: 1
+  # VEP filtering
+  # Provide a filtering expression for filter_vep.
+  # Multiple conditions can be combined as described in the filter_vep docs.
+  vep:

   # columns to keep in the final variant tables (apart from CHROM,POS,ID,REF1,ALT,GT)
   columns: [GQ,DP,FS,SOR,POPMAX_calc,Adj_calc,EUR_AF_calc,EAS_AF_calc,AFR_AF_calc,SAS_AF_calc,ESP_AF_POPMAX_calc,ESP_AF_GLOBAL_calc,CSQ,dbNSFP_MutationTaster_pred,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_SIFT_pred,SNPEFF_IMPACT]


coverage:

  # set to 1 to activate coverage calculation over the candidate genes specified below.
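  # (illustrative sketch: calculate: 1 together with a comma-separated
  #  candidates list such as ENSG0000XXXXXXX,ENSG0000YYYYYYY - placeholder
  #  ids, not real genes - would restrict the coverage plots to those genes)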
-  calculate: 0
+  calculate: 1

   # supply gtf
-  all_exons: /path/to/hg19_ensembl75/geneset_coding_exons.gtf.gz
+  all_exons: /path/to/geneset_coding_exons.gtf.gz

   # comma-separated list of canonical ensembl identifiers for candidate genes of interest
   candidates: ''

From 3231f3166d405a25989359e20335b01414ce0c6a Mon Sep 17 00:00:00 2001
From: jscaber
Date: Fri, 31 Mar 2023 16:48:45 +0100
Subject: [PATCH 32/43] various improvements to diff expression pipeline -
 able to handle unexpected data
---
 cgatpipelines/Rtools/diffexpression.R                | 8 +++++---
 cgatpipelines/Rtools/exploratory.R                   | 7 +++++--
 cgatpipelines/Rtools/filtercounts.R                  | 9 +++++++--
 cgatpipelines/tools/pipeline_rnaseqdiffexpression.py | 3 ++-
 4 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R
index b69374312..20fda35ff 100644
--- a/cgatpipelines/Rtools/diffexpression.R
+++ b/cgatpipelines/Rtools/diffexpression.R
@@ -35,7 +35,7 @@ getmart <- function(values){
   return(data)
 }
 
-start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") {
+start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") {
   file = get_output_filename(paste0(outdir,"/",section, ".", type))
   Cairo(file = file,
         type = type,
@@ -78,7 +78,8 @@ plotTPMs <- function(dftemp, contrast_name){
     ggplot(aes(x = contrast, y = value, color = contrast)) +
     geom_point(position = position_jitter(w = 0.15, h = 0)) +
     facet_wrap(~ var, scales = "free") + theme_bw() +
-    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")
+    ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+
+    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
 }
 
 
@@ -114,7 +115,7 @@ run <- function(opt) {
   flog.info("... plotting MA")
   ## MA Plot
   start_plot("MAPlot", outdir=opt$outdir)
-  DESeq2::plotMA(resLFC, ylim = c(-3,3))
+  DESeq2::plotMA(resLFC, ylim = c(floor(min(resLFC$log2FoldChange)), ceiling(max(resLFC$log2FoldChange))))
   end_plot()
 
   flog.info("... saving DE data")
@@ -178,6 +179,7 @@ run <- function(opt) {
 
   flog.info("...
plotting user-defined genes") genelist <- unlist(strsplit(opt$userlist, ",")) + genelist <- genelist[genelist %in% rownames(dds)] dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) start_plot("Userdefined", outdir=opt$outdir) print(plotTPMs(dftemp, opt$contrast)) diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R index ce966434d..620522e50 100755 --- a/cgatpipelines/Rtools/exploratory.R +++ b/cgatpipelines/Rtools/exploratory.R @@ -146,7 +146,7 @@ run <- function(opt) { names(variable.group) <- opt$contrast scores <- data.frame(variable.group, pca$x[,1:dim_pca]) start_plot(paste0('PCA_grid'), outdir=opt$outdir) - if(dim_pca>7){ + if(dim_pca==8){ print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) + geom_point(shape = 16, size = 0.5, position = 'auto') + geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') + @@ -187,7 +187,9 @@ run <- function(opt) { # Heatmap of Genes of interest if (!is.null(opt$genes_of_interest)) { print(rownames(assay(vsd))) - mat <- assay(vsd)[opt$genes_of_interest, ] + goi <- opt$genes_of_interest + goi <- goi[goi %in% rownames(assay(vsd))] + mat <- assay(vsd)[goi, ] mat <- mat - rowMeans(mat) temp <- getmart(rownames(mat)) row.names(temp) <- temp$ensembl_gene_id @@ -227,6 +229,7 @@ run <- function(opt) { ### EXPLORE BATCH EFFECTS ### futile.logger::flog.info(paste("Exploring Batch Effects")) + futile.logger::flog.info(dim_pca) if(dim_pca == 8){ for(factor in opt$factors){ factor_transformed <- vsd diff --git a/cgatpipelines/Rtools/filtercounts.R b/cgatpipelines/Rtools/filtercounts.R index 00ff92ef9..6f8d7ebd7 100755 --- a/cgatpipelines/Rtools/filtercounts.R +++ b/cgatpipelines/Rtools/filtercounts.R @@ -28,9 +28,10 @@ run <- function(opt) { ### READING DATA ### # Read in sampleData Table futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData))) - sampleData <- read.table(opt$sampleData, header = TRUE) + sampleData <- read_tsv(opt$sampleData) sampleData <-sampleData[sampleData$include ==1, ] futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ","))) + futile.logger::flog.info(paste(sampleData)) rownames(sampleData) <- sampleData$track futile.logger::flog.info(paste("reading in data from ", opt$source)) @@ -81,7 +82,8 @@ run <- function(opt) { flattenedfile=opt$flattenedFile) } else if(opt$source == "counts_table"){ # Read in Data - raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1) + raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE) + futile.logger::flog.info(paste("Colnames: ", colnames(raw))) experiment_tsv <- raw[,sampleData$track,drop=FALSE] if(opt$method == "deseq2"){ dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model)) @@ -107,6 +109,9 @@ run <- function(opt) { futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ","))) keep <- rowSums(counts(dataset)) >= 10 dataset <- dataset[keep,] + #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),] + #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,] + #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6) counts_table <- counts(dataset) futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ","))) } else if(opt$method == "dexseq"){ diff --git 
a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py index 552775a82..a9e80c85e 100644 --- a/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py +++ b/cgatpipelines/tools/pipeline_rnaseqdiffexpression.py @@ -539,7 +539,7 @@ def buildSalmonIndex(infile, outfile): path to output file ''' - job_memory = "30G" + job_memory = "40G" threads = 12 if PARAMS["salmon_threads"] is not None: threads = PARAMS["salmon_threads"] @@ -1037,6 +1037,7 @@ def count(): def filterDESeq2(infiles, outfile, design_name, quantifier_name): ''' Load counts into RDS object and filter''' + job_memory = '20G' counts, design = infiles transcripts, genes = counts quantifier_name = quantifier_name.lower() From af5661a2cff035f23f2dc7172e1e82df328a4cd0 Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 20 Sep 2023 09:44:13 +0100 Subject: [PATCH 33/43] Extensive changes to exome pipeline, bamstats and exploratory analysis --- cgatpipelines/Rtools/exploratory.R | 21 +-- cgatpipelines/tasks/bamstats.py | 14 +- cgatpipelines/tools/pipeline_bamstats.py | 20 +++ cgatpipelines/tools/pipeline_exome.py | 171 +++++++++++++++++++---- cgatpipelines/tools/pipeline_genesets.py | 2 +- 5 files changed, 175 insertions(+), 53 deletions(-) diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R index 620522e50..72a7b2765 100755 --- a/cgatpipelines/Rtools/exploratory.R +++ b/cgatpipelines/Rtools/exploratory.R @@ -74,26 +74,7 @@ run <- function(opt) { dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) } futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ","))) - - ### SVA - ANALYSIS OF BIASES ### - futile.logger::flog.info(paste("Performing Surrogate Variable Analysis")) - dds <- estimateSizeFactors(dds) - dat <- counts(dds, normalized = TRUE) - idx <- rowMeans(dat) > 1 - dat <- dat[idx, ] - mod <- model.matrix(formula(opt$model), colData(dds)) - mod0 <- model.matrix(~ 1, colData(dds)) - svseq <- svaseq(dat, mod, mod0, n.sv = 2) - for(factor in opt$factors){ - start_plot(paste0('SVA for ', factor), outdir=opt$outdir) - par(mfrow = c(2, 1), mar = c(3,5,3,1)) - for (i in 1:2) { - stripchart(svseq$sv[, i] ~ colData(dds)[, factor], vertical = TRUE, main = paste0("SV", i)) - abline(h = 0) - } - end_plot() - } - + ### TRANSFORMATION OF DATA ### futile.logger::flog.info(paste("Transforming data")) rld<- rlog(dds) diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py index e67ca4916..a31367a47 100644 --- a/cgatpipelines/tasks/bamstats.py +++ b/cgatpipelines/tasks/bamstats.py @@ -67,7 +67,7 @@ def buildPicardInsertSizeStats(infile, outfile, genome_file, Filename with genomic sequence. 
''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -159,7 +159,7 @@ def buildPicardAlignmentStats(infile, outfile, genome_file, ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -191,7 +191,7 @@ def buildPicardDuplicationStats(infile, outfile, picardmem): ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -244,7 +244,7 @@ def buildPicardDuplicateStats(infile, outfile, picardmem): Output filename with picard output. ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -279,7 +279,7 @@ def buildPicardCoverageStats(infile, outfile, baits, regions, ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -310,7 +310,7 @@ def buildPicardGCStats(infile, outfile, genome_file, picardmem): """ job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 if BamTools.getNumReads(infile) == 0: @@ -1291,7 +1291,7 @@ def buildPicardRnaSeqMetrics(infiles, strand, outfile, picardmem): ''' job_memory = picardmem - picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals() + picard_opts = '-Xmx%(job_memory)s -XX:+UseG1GC ' % locals() job_threads = 3 infile, genome, rRNA_intervals = infiles diff --git a/cgatpipelines/tools/pipeline_bamstats.py b/cgatpipelines/tools/pipeline_bamstats.py index 5df60e88f..2f4b802ce 100644 --- a/cgatpipelines/tools/pipeline_bamstats.py +++ b/cgatpipelines/tools/pipeline_bamstats.py @@ -356,6 +356,26 @@ def buildPicardDuplicationStats(infile, outfile): PICARD_MEMORY) +@follows(mkdir("RSeQC.dir")) +@follows(countReads) +@transform(intBam, + regex("BamFiles.dir/(.*).bam$"), + r"RSeQC.dir/\1.read_distribution") +def runRSeQC(infile, outfile): + + bed_file = "/ceph/project/talbotlab/jscaber/iPS-C9TrapYan/bamstats-fullgenome/hg38_GENCODE.v38.bed" + + job_memory = "6G" + + statement = ''' + read_distribution.py + -i %(infile)s + -r %(bed_file)s + > %(outfile)s + ''' + + P.run(statement) + @follows(mkdir("BamStats.dir")) @follows(countReads) @transform(intBam, diff --git a/cgatpipelines/tools/pipeline_exome.py b/cgatpipelines/tools/pipeline_exome.py index cb79d5137..1c2c06bdc 100644 --- a/cgatpipelines/tools/pipeline_exome.py +++ b/cgatpipelines/tools/pipeline_exome.py @@ -164,7 +164,7 @@ from ruffus import transform, mkdir, follows, merge, regex, suffix, \ jobs_limit, files, collate, add_inputs, formatter, \ active_if, originate, subdivide -from ruffus.combinatorics import permutations +from ruffus.combinatorics import permutations, product import sys import os import csv @@ -201,6 +201,8 @@ PANELS = glob.glob("*.panel.tsv") +FILTERS = 
glob.glob("*.filter.tsv")
+

###############################################################################
###############################################################################
@@ -277,7 +279,7 @@ def createEnsemblDictionary(outfile):
 @subdivide([x for x in PANELS],
            regex("(\S+).panel.tsv"),
            add_inputs(createEnsemblDictionary),
-           r"\1.bed")
+           r"panels/\1.bed")
 def createPanelBed(infiles, outfile):
 
     geneset = PARAMS['geneset']
@@ -713,13 +715,34 @@ def ancestrySomalier(infile,outfile):
 
     P.run(statement)
 
+###############################################################################
+# Expansionhunter
+
+
+@follows(mkdir("expansionhunter"))
+@transform(RemoveDuplicatesSample, regex(r"gatk/(\S+).dedup2.bam"),
+           r"expansionhunter/\1.vcf")
+def expansionHunter(infile, outfile):
+
+    genome = PARAMS["genome_dir"] + "/" + PARAMS["genome"] + ".fa"
+    catalog = PARAMS["expansionhunter"]
+    # ExpansionHunter appends its own suffixes, so strip the extension
+    outfile = P.snip(outfile)
+
+    statement = '''ExpansionHunter
+    --reads %(infile)s
+    --reference %(genome)s
+    --variant-catalog %(catalog)s
+    --output-prefix %(outfile)s
+    > %(outfile)s.log 2>&1
+    '''
+    P.run(statement, job_condaenv="expansionhunter")

###############################################################################
# VEP


 @transform(applyVQSRIndels, regex(r"variants/all_samples.vqsr.vcf.gz"),
-           r"variants/all_samples.vep.txt.gz")
+           r"variants/all_samples.vep.vcf.gz")
 def annotateVariantsVEP(infile, outfile):
 
     #Adds annotations as specified in the pipeline.yml using Ensembl
@@ -744,11 +767,12 @@ def annotateVariantsVEP(infile, outfile):
 
     statement = '''vep
     -i %(infile)s
-    --cache --dir %(vep_cache)s --tab
+    --cache --dir %(vep_cache)s --vcf
    --species %(vep_species)s
    --fork %(job_threads)s
    --compress_output bgzip
    --assembly %(vep_assembly)s
+    --plugin SameCodon
    -o %(outfile)s --force_overwrite'''
    if dbNSFP_path is not None:
        statement +='''
@@ -763,30 +787,62 @@ def annotateVariantsVEP(infile, outfile):
     P.run(statement)
 
 
-@transform(annotateVariantsVEP, regex("variants/all_samples.vep.vcf.gz"),
-           r'all_samples.filtered.vcf.gz')
-def filterVEP(infile, outfiles):
+
+@subdivide([x for x in FILTERS],
+           regex("(\S+).filter.tsv"),
+           add_inputs(annotateVariantsVEP),
+           r"variants/all_samples.\1.vcf.gz")
+def filterVEP(infiles, outfile):
     '''
-    Filter variants using vep-filter.
-    More docs
+    Filter the VEP-annotated VCF with filter_vep, applying the filter
+    expression read from each *.filter.tsv file in the working directory.
     '''
-    filters = PARAMS["annotation"]
+    filter_file,infile = infiles
+    with open(filter_file) as f:
+        filters = f.read()
+    outfile = P.snip(outfile, ".gz")
 
     statement = '''filter_vep
     --gz -i %(infile)s
    -o %(outfile)s
-    --only-matched
-    --force-overwrite
-    -f %(filter)s
+    --only_matched
+    --force_overwrite
+    -f "%(filters)s"
+    > %(outfile)s.log 2>&1;
+    bgzip %(outfile)s;
+    tabix -p vcf %(outfile)s.gz
    '''
    P.run(statement)
 
 
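+# For illustration only: a hypothetical rare.filter.tsv consumed above might
+# hold a single filter_vep expression such as
+#     MAX_AF < 0.001 and IMPACT is HIGH
+# (a sketch - see the filter_vep documentation for the full expression grammar).
+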
+@product(filterVEP, formatter("variants/all_samples.(?P<FILTER>.*).vcf.gz$"),
+         createPanelBed, formatter("panels/(?P<PANEL>.*).bed$"),
+         "results/{FILTER[0][0]}/{PANEL[1][0]}.vcf.gz")
+def filterPanel(infiles, outfile):
+    '''
+    Subset each filtered VCF to the regions of a gene panel using GATK
+    SelectVariants, one output per filter/panel combination.
+    '''
+    vcf, panelbed = infiles
+    outfile = P.snip(outfile, ".gz")
+
+    statement = '''gatk
+    SelectVariants
+    -V %(vcf)s
+    -O %(outfile)s
+    -L %(panelbed)s
+    > %(outfile)s.log 2>&1;
+    bgzip %(outfile)s;
+    tabix -p vcf %(outfile)s.gz
+    '''
+    P.run(statement)
 
 
-@follows(mkdir("variant_tables"))
-@transform(RemoveDuplicatesSample, regex(r"gatk/(.*).dedup2.bam"),
-           add_inputs(filterVEP), r"variant_tables/\1.tsv")
+@product(RemoveDuplicatesSample, formatter("gatk/(?P<TRACK>.*).dedup2.bam$"),
+         filterPanel, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*).vcf.gz$"),
+         "results/{FILTER[1][0]}/{PANEL[1][0]}/{TRACK[0][0]}.tsv")
 def makeAnnotationsTables(infiles, outfile):
     '''
     Converts the multi sample vcf into
@@ -814,7 +870,8 @@ def makeAnnotationsTables(infiles, outfile):
             cols2.append(line.split("\t")[0])
     l1 = "\t".join(cols2)
     l2 = "\t".join(colds)
-    out = open(outfile, "w")
+    os.unlink(TF)
+    out = open(TF, "w")
     out.write('''CHROM\tPOS\tQUAL\tID\tFILTER\tREF1\tALT\tGT\t%s\n\
chromosome\tposition\tquality\tid\tfilter\tref\talt\t\
genotype\t%s\n''' % (l1, l2))
@@ -822,19 +879,83 @@ def makeAnnotationsTables(infiles, outfile):
     cstring = "\\t".join(cols)
     cstring = "%CHROM\\t%POS\\t%QUAL\\t%ID\\t%FILTER\\t%REF\\t\
%ALT\\t[%GT]\\t" + cstring
-    if PARAMS['test'] == 1:
-        statement = '''bcftools query
-        -f '%(cstring)s\\n'
-        -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-        %(inputvcf)s >> %(outfile)s'''
+    statement = '''bcftools query -s %(samplename)s
+    -f '%(cstring)s\\n'
+    -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
+    %(inputvcf)s >> %(TF)s'''
+    P.run(statement)
+
+    # raise the csv field size limit as far as the platform allows
+    maxInt = sys.maxsize
+    while True:
+        # decrease the maxInt value by factor 10
+        # as long as the OverflowError occurs.
+        try:
+            csv.field_size_limit(maxInt)
+            break
+        except OverflowError:
+            maxInt = int(maxInt/10)
+
+    df = pd.read_table(TF, sep = '\t', engine='python', quoting=3)
+    CSQ_index = df['CSQ'][0].split("Format:_")[-1].rstrip('\"')
+    df = df.drop(0)
+
+    if len(df)==0:
+        df.drop('CSQ',axis=1,inplace=True)
+        temp_df = pd.DataFrame(columns=CSQ_index.split("|"))
     else:
-        statement = '''bcftools query -s %(samplename)s
-        -f '%(cstring)s\\n'
-        -i 'FILTER=="PASS" && GT!="0/0" && GT!="./."'
-        %(inputvcf)s >> %(outfile)s'''
+        df['CSQ'] = df.CSQ.str.split(",")
+        df = df.explode('CSQ').reset_index(drop=True)
+        temp_df = df.CSQ.str.split("|", expand = True)
+        temp_df.columns = CSQ_index.split("|")
+        df.drop('CSQ',axis=1,inplace=True)
+        df = df.join(temp_df, rsuffix="_VEP")
+    if "SYMBOL" in df.columns:
+        col = df.pop("SYMBOL")
+        df.insert(0, "SYMBOL", col)
+    if "HGVSp_VEP" in df.columns:
+        col = df.pop("HGVSp_VEP")
+        df.insert(0, "HGVSp_VEP", col)
+    if len(df)!=0 and "CANONICAL" in df.columns and "PolyPhen" in df.columns:
+        # Select canonical variant, then most deleterious
+        df_final = df.sort_values(by = ['CANONICAL', 'PolyPhen'], ascending = [False, False]).drop_duplicates(["CHROM", "POS"])
+        df_final.to_csv(outfile, sep="\t")
+        outfile_full = P.snip(outfile, ".tsv") + ".tsv_full"
+        df.to_csv(outfile_full, sep="\t")
+    else:
+        df.to_csv(outfile, sep="\t")
+    os.unlink(TF)
+
+
+
+@collate(makeAnnotationsTables, formatter("results/(?P<FILTER>.*)/(?P<PANEL>.*)/.*.tsv$"),
+         "results/{FILTER[0]}/{PANEL[0]}_merged.tsv")
+def mergeAnnotationsTables(infiles, outfile):
+    '''Concatenate the per-sample variant tables of one filter/panel
+    combination into a single table.'''
+
+    # only merge if at least one input table holds data beyond its header
+    result = False
+    for infile in infiles:
+        lines = 0
+        with open(infile) as f:
+            for line in f:
+                lines = lines + 1
+                if lines > 1:
+                    result = True
+                    break
+
+    # all inputs live in the same directory, so any of them will do
+    path = os.path.dirname(infiles[0])
+    infiles = " ".join(infiles)
+
+    if result:
+        statement = ''' cgat tables2table %(infiles)s
+    --cat patient_id
+    --regex-filename '%(path)s/(\S+).tsv'
+    | awk '!/^#/{print > "%(outfile)s"; next} 1'
+    > %(outfile)s.log 2>&1'''
+    else:
+        statement = '''touch %(outfile)s'''
 
     P.run(statement)
 
 
+
###############################################################################
# Quality Filtering
#
diff --git a/cgatpipelines/tools/pipeline_genesets.py b/cgatpipelines/tools/pipeline_genesets.py
index 20553b932..aaae674f6 100644
--- a/cgatpipelines/tools/pipeline_genesets.py
+++ b/cgatpipelines/tools/pipeline_genesets.py
@@ -638,7 +638,7 @@ def buildCdsTranscript(infile, outfile):
 
 @transform(buildUCSCGeneSet,
-           regex("*.gtf.gz"),
+           regex(".*.gtf.gz"),
            PARAMS['interface_geneset_exons_gtf'])
 def buildExonTranscript(infile, outfile):
     '''
From a0524b9283f08221a14b6acdb2343fef162521cc Mon Sep 17 00:00:00 2001
From: jscaber
Date: Wed, 20 Sep 2023 09:45:16 +0100
Subject: [PATCH 34/43] changes to exome.py
---
 cgatpipelines/tasks/exome.py | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/cgatpipelines/tasks/exome.py b/cgatpipelines/tasks/exome.py
index edee873fc..e9521ba2e 100644
--- a/cgatpipelines/tasks/exome.py
+++ b/cgatpipelines/tasks/exome.py
@@ -337,21 +337,6 @@ def vcfToTable(infile, outfile, genome, columns, gatkmem):
##############################################################################
 
 
-def snpSift(infile, outfile, vcf, bgzip=True, memory="6G"):
-    '''Annotate VCF with other VCF'''
-    job_memory=memory
-    statement = '''SnpSift annotate
-    -Xmx%(job_memory)s %(vcf)s
-    %(infile)s > %(outfile)s 2> %(outfile)s.log;'''
-    if bgzip:
-        statement += '''bgzip %(outfile)s;
-        tabix -p vcf %(outfile)s.gz'''
-    P.run(statement)
-
-
-##############################################################################
-
-
 def selectVariants(infile, outfile, genome, select):
     '''Filter de novo variants based on provided expression'''
     statement = '''gatk SelectVariants
From 9342ccb719a3b7b4f6c18c1b2ce89c45e39c56be Mon Sep 17 00:00:00 2001
From: jscaber
Date: Sat, 30 Mar 2024 14:00:16 +0000
Subject: [PATCH 35/43] further development
---
 cgatpipelines/Rtools/diffexpression.R   |  4 ++--
 cgatpipelines/Rtools/exploratory.R      | 28 +++++++++++++++---------
 cgatpipelines/tasks/bamstats.py         |  2 +-
 cgatpipelines/tools/pipeline_mapping.py |  2 +-
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/cgatpipelines/Rtools/diffexpression.R b/cgatpipelines/Rtools/diffexpression.R
index 20fda35ff..e5c279b2d 100644
--- a/cgatpipelines/Rtools/diffexpression.R
+++ b/cgatpipelines/Rtools/diffexpression.R
@@ -22,11 +22,11 @@ suppressMessages(library(goseq))
 
 source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 
-mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org")
+mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org")
 getmart <- function(values){
   data<- getBM(
     filters= "ensembl_gene_id",
-    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name',
+    attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id", 'chromosome_name',
                   'start_position', 'end_position'),
     values= values,
     mart= mart,
diff --git a/cgatpipelines/Rtools/exploratory.R b/cgatpipelines/Rtools/exploratory.R
index 72a7b2765..20aa2b7bc 100755
--- a/cgatpipelines/Rtools/exploratory.R
+++ b/cgatpipelines/Rtools/exploratory.R
@@ -33,11 +33,11 @@ source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
 # Dependencies: biomaRt package
 # Input: vector of ensembl gene
ids # Output: data frame with description, symbol and entrez gene -mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "https://aug2020.archive.ensembl.org") getmart <- function(values){ data<- getBM( filters= "ensembl_gene_id", - attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene_id"), values= values, mart= mart, useCache = FALSE) @@ -77,13 +77,12 @@ run <- function(opt) { ### TRANSFORMATION OF DATA ### futile.logger::flog.info(paste("Transforming data")) - rld<- rlog(dds) + dds <- estimateSizeFactors(dds) vsd<- vst(dds) df <- bind_rows( as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% mutate(transformation = "log2(x + 1)"), - as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"), - as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog")) + as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst")) colnames(df)[1:2] <- c("x", "y") start_plot('Variance_Transformations', outdir=opt$outdir) print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + @@ -101,11 +100,12 @@ run <- function(opt) { if(dim(pca$x)[1]>7){ dim_pca = 8 } else { - dim_pca = dim(pca$x) + dim_pca = dim(pca$x)[1] } scores <- data.frame(variable.group, pca$x[,1:dim_pca]) start_plot(paste0('PCA_', factor), outdir=opt$outdir) - print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group)) + + print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group))) + + geom_point() + theme(legend.position="right") + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), y=paste0("PC2 (", percentVar[2],"% of variance)")) + @@ -114,7 +114,8 @@ run <- function(opt) { theme(text=element_text(family='serif'))) end_plot() start_plot(paste0('PCA_13_', factor), outdir=opt$outdir) - print(qplot(x=PC1, y=PC3, data=scores, colour=factor(variable.group)) + + print(ggplot(scores, aes(x=PC1, y=PC3, colour=factor(variable.group))) + + geom_point() + theme(legend.position="right") + labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), y=paste0("PC3 (", percentVar[3],"% of variance)")) + @@ -152,14 +153,18 @@ run <- function(opt) { df <- as.data.frame(colData(dds)[,opt$factors]) rownames(df) <- colData(dds)$track # Heatmap of Top 20 Expressed Genes - select <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] + selected <- order(rowMeans(counts(dds,normalized=TRUE)),decreasing=TRUE)[1:20] start_plot('Heatmap_topExpressed', outdir=opt$outdir) - pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df) + pheatmap(assay(vsd)[selected,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df) end_plot() # Heatmap of Top 20 Variable Genes topVarGenes <- head(order(rowVars(assay(vsd)),decreasing=TRUE),20) + futile.logger::flog.info(paste("topVarGenes ",topVarGenes)) mat <- assay(vsd)[ topVarGenes, ] + futile.logger::flog.info(paste("mat ",rownames(mat))) temp <- getmart(rownames(mat)) + temp <- temp[!duplicated(temp$ensembl_gene_id), ] + futile.logger::flog.info(paste("temp ",temp)) row.names(temp) <- temp$ensembl_gene_id rownames(mat) <- temp[rownames(mat),"external_gene_name"] start_plot('Heatmap_topVariable', outdir=opt$outdir) @@ -221,7 +226,8 @@ run <- function(opt) { percentVar <- round(100 * 
summary(pca)$importance[2,])
     scores <- data.frame(variable.group, sample.group, pca$x[,1:2])
     start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir)
-    print(qplot(x=PC1, y=PC2, data=scores, colour=factor(variable.group), shape=factor(sample.group)) +
+    print(ggplot(scores, aes(x=PC1, y=PC2, colour=factor(variable.group), shape=factor(sample.group))) +
+            geom_point() +
             theme(legend.position="right") +
             labs(colour=opt$contrast, shape=factor,
                  x=paste0("PC1 (", percentVar[1],"% of variance)"),
                  y=paste0("PC2 (", percentVar[2],"% of variance)")) +
diff --git a/cgatpipelines/tasks/bamstats.py b/cgatpipelines/tasks/bamstats.py
index a31367a47..720e03fa7 100644
--- a/cgatpipelines/tasks/bamstats.py
+++ b/cgatpipelines/tasks/bamstats.py
@@ -1060,7 +1060,7 @@ def loadIdxstats(infiles, outfile):
                 columns=(['region', 'mapped']))
 
             # reformat the df
-            df = df.append(reformatted_df, ignore_index=True)
+            df = df._append(reformatted_df, ignore_index=True)  # DataFrame.append was removed in pandas 2.0
             df.set_index('region', inplace=True)
             df1 = df[['mapped']].T
             dfs.append(df1)
diff --git a/cgatpipelines/tools/pipeline_mapping.py b/cgatpipelines/tools/pipeline_mapping.py
index 90f019a8c..3d6445674 100644
--- a/cgatpipelines/tools/pipeline_mapping.py
+++ b/cgatpipelines/tools/pipeline_mapping.py
@@ -1980,7 +1980,7 @@ def mergeBAMFiles(infiles, outfile):
     infiles = " ".join(infiles)
 
     statement = '''
-    samtools merge %(outfile)s %(infiles)s >& %(outfile)s.log &&
+    samtools merge -f %(outfile)s %(infiles)s >& %(outfile)s.log &&
    samtools index %(outfile)s
    '''
    P.run(statement)
From a78a14996c7240156037b36fb3c9e4cc581be2b7 Mon Sep 17 00:00:00 2001
From: jscaber
Date: Sat, 30 Mar 2024 14:06:41 +0000
Subject: [PATCH 36/43] addition of setup instruction for R tools
---
 MANIFEST.in | 3 +++
 setup.py    | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 99ff29cfa..44c74b012 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -22,5 +22,8 @@ exclude scripts/*
 include scripts/__init__.py
 include scripts/version.py
 include scripts/cgat_ruffus_profile.py
+include cgatpipelines/Rtools/*
+include cgatpipelines/experiment.R
+
 
 # extensions
diff --git a/setup.py b/setup.py
index a1c95e7de..ec544f9ab 100644
--- a/setup.py
+++ b/setup.py
@@ -70,7 +70,7 @@
     raise SystemExit("""CGAT requires Python 3 or later.""")
 
 cgat_packages = find_packages()
-cgat_package_dirs = {'cgatpipelines': 'cgatpipelines'}
+cgat_package_dirs = {'cgatpipelines': 'cgatpipelines', 'Rtools': 'Rtools'}
 
 # Classifiers
 classifiers = """
From 5ed403960934ba5ed14e51ee975ce510859e95fe Mon Sep 17 00:00:00 2001
From: jscaber
Date: Sat, 30 Mar 2024 14:07:34 +0000
Subject: [PATCH 37/43] draft long read pipeline
---
 cgatpipelines/tools/pipeline_longread.py | 180 +++++++++++++++++++++++
 1 file changed, 180 insertions(+)
 create mode 100644 cgatpipelines/tools/pipeline_longread.py

diff --git a/cgatpipelines/tools/pipeline_longread.py b/cgatpipelines/tools/pipeline_longread.py
new file mode 100644
index 000000000..89f54af05
--- /dev/null
+++ b/cgatpipelines/tools/pipeline_longread.py
@@ -0,0 +1,180 @@
+"""===========================
+Pipeline longread
+===========================
+
+Overview
+========
+
+This draft pipeline processes targeted Oxford Nanopore long-read data:
+it runs the EPI2ME wf-cas9 workflow to extract on-target reads, aligns
+them with LAST and genotypes tandem repeats with tandem-genotypes.
+
+Usage
+=====
+
+See :ref:`PipelineSettingUp` and :ref:`PipelineRunning` on general
+ +Configuration +------------- + +The pipeline requires a configured :file:`pipeline.yml` file. +cgatReport report requires a :file:`conf.py` and optionally a +:file:`cgatreport.yml` file (see :ref:`PipelineReporting`). + +Default configuration files can be generated by executing: + + python /pipeline_@template@.py config + +Input files +----------- + +None required except the pipeline configuration files. + +Requirements +------------ + +The pipeline requires the results from +:doc:`pipeline_annotations`. Set the configuration variable +:py:data:`annotations_database` and :py:data:`annotations_dir`. + +Pipeline output +=============== + +.. Describe output files of the pipeline here + +Glossary +======== + +.. glossary:: + + +Code +==== + +""" +from ruffus import * +import sys +import os +import glob +import cgatcore.experiment as E +from cgatcore import pipeline as P + + +# load options from the config file +PARAMS = P.get_parameters( + ["%s/pipeline.yml" % os.path.splitext(__file__)[0], + "../pipeline.yml", + "pipeline.yml"]) + +NPUT_FORMATS = ("*.fastq.1.gz", "*.fastq.gz", "*.sra", "*.csfasta.gz") +REGEX_FORMATS = regex(r"(\S+).(fastq.1.gz|fastq.gz|sra|csfasta.gz)") + +matches = glob.glob("*.fastq.1.gz") + glob.glob( + "*.fastq.gz") + glob.glob("*.sra") + glob.glob("*.csfasta.gz") + +PICARD_MEMORY = PARAMS["picard_memory"] +GATK_MEMORY = PARAMS["gatk_memory"] +ANNOTATION_MEMORY = PARAMS["annotation_memory"] + +PANELS = glob.glob("*.panel.tsv") + +FILTERS = glob.glob ("*.filter.tsv") + + +# --------------------------------------------------- +# Specific pipeline tasks +@mkdir("results.dir") +@transform(("/ceph/project/talbotlab/jscaber/nanopore/"), + regex("(.*)"), + r"/ceph/project/talbotlab/jscaber/nanopore/results.dir") +def nextflowEpi2me(infolder, outfile): + '''count the number of words in the pipeline configuration files.''' + + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + target_regions = PARAMS["targets"] + + # the command line statement we want to execute + statement = '''nextflow run epi2me-labs/wf-cas9 + --fastq %(infolder)s + --ref_genome %(genome_file)s + --targets %(target_regions)s + --out_dir %(outfile)s + --full_report -profile singularity + + ''' + P.run(statement) + + +@mkdir("last.dir") +@transform(("nextflow/fastq_pass/fastq_pass_ontarget.fastq"), + regex("(.*)"), + r"last.dir/fastq_pass_on_target.par") +def lastTrain(infile, outfile): + '''count the number of words in the pipeline configuration files.''' + + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + last_index = PARAMS["LAST_index"] + + # the command line statement we want to execute + statement = '''last-train + -P10 -Q0 %(last_index)s %(infile)s > %(outfile)s + ''' + P.run(statement) + +@mkdir("last.dir") +@transform(lastTrain, + regex("(.*).par"), + add_inputs(r"nextflow/fastq_pass/fastq_pass_ontarget.fastq"), + r"\1.maf") +def lastAl(infiles, outfile): + '''count the number of words in the pipeline configuration files.''' + + infile1,infile2 = infiles + genome_file = os.path.abspath( + os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa")) + last_index = PARAMS["LAST_index"] + + # the command line statement we want to execute + statement = '''lastal -P10 --split -p + %(infile1)s %(last_index)s %(infile2)s > %(outfile)s + ''' + P.run(statement) + + +@transform(lastAl, + regex("(.*).maf"), + r"\1.txt") +def tandemGenotypes(infile, outfile): + '''count the number of words in the pipeline 
+@transform(lastAl,
+           regex("(.*).maf"),
+           r"\1.txt")
+def tandemGenotypes(infile, outfile):
+    '''call tandem repeat copy-number changes from the LAST alignments
+    with tandem-genotypes.'''
+
+    genome_file = os.path.abspath(
+        os.path.join(PARAMS["genome_dir"], PARAMS["genome"] + ".fa"))
+    last_index = PARAMS["LAST_index"]
+    repeat_mask = PARAMS["repeat_mask"]
+
+
+    # the command line statement we want to execute
+    statement = '''tandem-genotypes -g target_regions.bed %(repeat_mask)s %(infile)s > %(outfile)s
+    '''
+    P.run(statement)
+
+# ---------------------------------------------------
+# Generic pipeline tasks
+@follows(nextflowEpi2me)
+def full():
+    pass
+
+
+def main(argv=None):
+    if argv is None:
+        argv = sys.argv
+    P.main(argv)
+
+
+if __name__ == "__main__":
+    sys.exit(P.main(sys.argv))
From fccb2f10186c34ded8befdedb8d901ee40efb4b6 Mon Sep 17 00:00:00 2001
From: jscaber
Date: Wed, 11 Dec 2024 18:11:57 +0000
Subject: [PATCH 38/43] Changes to cgatflow.py, compatibility with python 3.12
---
 cgatpipelines/cgatflow.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/cgatpipelines/cgatflow.py b/cgatpipelines/cgatflow.py
index 0bf22afe4..374a21180 100644
--- a/cgatpipelines/cgatflow.py
+++ b/cgatpipelines/cgatflow.py
@@ -21,7 +21,7 @@
 import sys
 import re
 import glob
-import imp
+import importlib.util
 
 import cgatpipelines
@@ -84,9 +84,10 @@ def main(argv=None):
     # remove 'cgatflow' from sys.argv
     del sys.argv[0]
 
-    (file, pathname, description) = imp.find_module(pipeline, paths)
-
-    module = imp.load_module(pipeline, file, pathname, description)
+    spec = importlib.util.spec_from_file_location(pipeline, path+"/"+pipeline+".py")
+    module = importlib.util.module_from_spec(spec)
+    sys.modules[pipeline] = module
+    spec.loader.exec_module(module)
 
     module.main(sys.argv)
From e6275ba54a11c43061ca5b8e3f373719410c5ff0 Mon Sep 17 00:00:00 2001
From: jscaber
Date: Wed, 11 Dec 2024 18:13:13 +0000
Subject: [PATCH 39/43] remove rpy2
---
 conda/environments/cgat-flow.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/conda/environments/cgat-flow.yml b/conda/environments/cgat-flow.yml
index 2224fcb59..e465dfba5 100644
--- a/conda/environments/cgat-flow.yml
+++ b/conda/environments/cgat-flow.yml
@@ -14,7 +14,6 @@ channels:
 dependencies:
 # python dependencies
 - python >= 3.6
-- rpy2 <= 2.9.4
 - tzlocal
 
 # exome.py
From 32cac1e7686aff19713056b3b9b2d3f57deeb Mon Sep 17 00:00:00 2001
From: jscaber
Date: Wed, 11 Dec 2024 18:15:32 +0000
Subject: [PATCH 40/43] add R scripts to new directory
---
 cgatpipelines/tasks/diffexonexpression.R | 269 +++++++++++++++++++++++
 cgatpipelines/tasks/filtercounts.R       | 250 +++++++++++++++++++++
 cgatpipelines/tasks/script_template.R    |  66 ++++++
 cgatpipelines/tasks/venn.R               | 132 +++++++++++
 4 files changed, 717 insertions(+)
 create mode 100644 cgatpipelines/tasks/diffexonexpression.R
 create mode 100755 cgatpipelines/tasks/filtercounts.R
 create mode 100644 cgatpipelines/tasks/script_template.R
 create mode 100644 cgatpipelines/tasks/venn.R

diff --git a/cgatpipelines/tasks/diffexonexpression.R b/cgatpipelines/tasks/diffexonexpression.R
new file mode 100644
index 000000000..8cbfe9a8b
--- /dev/null
+++ b/cgatpipelines/tasks/diffexonexpression.R
@@ -0,0 +1,269 @@
+#' Running DEXSeq
+#'
+#' WARNING: This script is work-in-progress
+#'
+#' Example usage:
+#'
+#' Rscript PATH/TO/diffexonexpression.R --rds-filename=experiment.rds --model=~sample+exon+group:exon --reducedmodel=~sample+exon
+#'
+#' `experiment.rds` is a DEXSeq experiment object after filtering
+#'
+#'
+#' -> todo: ideas
+
+
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+suppressMessages(library(fgsea))
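+# (the remaining imports provide table wrangling (data.table, tidyverse),
+#  batch correction (sva, RUVSeq), annotation/enrichment (biomaRt, goseq)
+#  and plotting (Cairo) helpers; not all may be needed in every run)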
+suppressMessages(library(data.table)) +suppressMessages(library(sva)) +suppressMessages(library(biomaRt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(RUVSeq)) +suppressMessages(library(Cairo)) +suppressMessages(library(goseq)) +suppressMessages(library(DEXSeq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + values= values, + mart= mart) + data$description <- gsub("\t", "", data$description) + return(data) +} + +start_plot <- function(section, outdir = "", height = 6, width = 6, type = "png") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + + +# plotTPMs function +# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot +# Input: data frame +# Output: plot object +plotTPMs <- function(dftemp, contrast_name){ + dftemp %>% + gather(key = "var", value="value", -contrast, -track) %>% + mutate(var = factor(var, levels=unique(var))) %>% + ggplot(aes(x = contrast, y = value, color = contrast)) + + geom_point(position = position_jitter(w = 0.15, h = 0)) + + facet_wrap(~ var, scales = "free") + theme_bw() + + ylab("normalised counts") + xlab (contrast_name) + guides(color = "none") +} + + +run <- function(opt) { + + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + if (class(experiment) != "DEXSeqDataSet"){ + stop("This script supports DEXSeqDataSet objects only.") + } + flog.info("Running DEXSeq") + dxd = estimateSizeFactors(experiment) + dxd = estimateDispersions(dxd) + dxd = testForDEU(dxd, reducedModel = formula(opt$reducedmodel), + fullModel = formula(opt$model)) + dxd = estimateExonFoldChanges( dxd, fitExpToVar=opt$contrast) + res = DEXSeqResults(dxd) + + flog.info("... plotting dispersion estimates") + ## Plot dispersion estimates + start_plot("Dispersion", opt$outdir) + plotDispEsts(dxd) + end_plot() + + flog.info("... plotting MA") + ## MA Plot + start_plot("MAPlot", opt$outdir) + plotMA(res, ylim = c(-3,3)) + end_plot() + + flog.info("... saving DE data") + ## Save DE data + #resSig <- subset(res, padj < opt$alpha) + #data <- getmart(as.character(map(res$transcripts, 1))) + #res$symbol<-data$external_gene_name[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))] + #res$desc<-data$description[match(data$ensembl_transcript_id, as.character(map(res$transcripts, 1)))] + resSig <- subset(res, padj < opt$alpha) + write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t") + write.table(res, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t") + + flog.info("... plotting P histogram") + ## Plot P value Histogram + start_plot("PHistogram", opt$outdir) + hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram") + end_plot() + + flog.info("... 
plotting downregulated genes") + ## Plot Top Downregulated Genes + genelist <- rownames(res[ order( res[,grepl("log2",colnames(res))] ), ][0:9,]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Downregulated_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + + flog.info("... plotting upregulated genes") + genelist <- rownames(res[ order( -res[,grepl("log2",colnames(res))] ), ][0:9,]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Upregulated_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + flog.info("... plotting significant genes") + genelist <- rownames(resSig[order(resSig$padj), ]) + for(gene in unique(substr(genelist,0,15))){ + start_plot(paste0("Significant_",gene), opt$outdir) + plotDEXSeq(res, gene, legend=TRUE, cex.axis=1.2, cex=1.3, lwd=2 ,fitExpToVar = opt$contrast) + end_plot() + } + + if(opt$perm >0){ + flog.info("... performing permutations") + designperm <- colData(dxd) + dxdperm = dxd + x = 0 + i = 1 + y= list() + while(i <= opt$perm) { + flog.info(paste0("...... Permutation ", i)) + designperm[, opt$contrast] = sample(designperm[, opt$contrast]) + if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dxd)[,opt$contrast])[1]){ + flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group),collapse = ","))) + next + } + colData(dxdperm) <- designperm + dxdperm = testForDEU(dxdperm) + dxdperm = estimateExonFoldChanges( dxdperm, fitExpToVar=opt$contrast) + resperm = DEXSeqResults(dxdperm) + x[i] = length(subset(resperm, padj < opt$alpha)$padj) + y[[i]] = dxdperm$group + i = i+1 + } + flog.info(paste0("Result: " , x)) + start_plot("Simulations", outdir=opt$outdir) + theme_set(theme_gray(base_size = 18)) + sims = qplot(x, + geom="histogram", + breaks=seq(0, 20, by = 1),fill=I("grey"), col=I("black"), + main = "Histogram of DE experiments\n with random group labels", + xlab = "Number of differentially expressed exons", + ylab = "Number of simulations") + + geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) + + theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22)) + print(sims) + end_plot() + flog.info(paste0("... Permutation p value: ", + length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x))) + z <- list() + for(i in 0:length(x)){ + z[i] <- paste( unlist(y[i]), collapse=' ') + } + z <- unlist(z) + df <- data.frame(number=x, combination=z) + df + write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv")) + } +} + +main <- function() { + + option_list <- list( + make_option( + "--rds-filename", + dest = "rds_filename", + type = "character", + default = "experiment.rds", + help = paste("filename with input data of counts") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~ sample + exon + group:exon", + help = paste("model formula for GLM") + ), + make_option( + "--reducedmodel", + dest = "reducedmodel", + type = "character", + default = "~ sample + exon", + help = paste("model formula for GLM") + ), + make_option( + "--contrast", + dest = "contrast", + type = "character", + default = "group", + help = paste("contrast/factor to use for comparison") + ), + make_option( + "--refgroup", + dest = "refgroup", + type = "character", + default = "CTR", + help = paste("reference group, e.g. 
CTR") + ), + make_option( + "--alpha", + dest = "alpha", + type = "numeric", + default = 0.05, + help = paste("Adjusted P value threshold") + ), + make_option( + "--outdir", + dest = "outdir", + type = "character", + default = "experiment", + help = paste("Libraries for gsea") + ), + make_option( + "--permute", + dest = "perm", + type = "integer", + default = 0, + help = paste("number of permutations") + ), + make_option( + "--pathways", + dest = "pathways", + type = "character", + default = "", + help = paste("Libraries for gsea") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$pathways)) { + opt$pathways = unlist(strsplit(opt$pathways, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/filtercounts.R b/cgatpipelines/tasks/filtercounts.R new file mode 100755 index 000000000..6f8d7ebd7 --- /dev/null +++ b/cgatpipelines/tasks/filtercounts.R @@ -0,0 +1,250 @@ +#' Basic filtering analysis Script +#' +#' +#' Example usage: +#' +#' Rscript PATH/TO/filtercounts.R +#' +#' input: directory containing read count files or tsv file containing reads +#' additional input variables: method used to generate file, model +#' output: `experiment_out.rds` is an experiment object after filtering +#' + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(data.table)) +suppressMessages(library(DESeq2)) +suppressMessages(library(edgeR)) +suppressMessages(library(csaw)) +suppressMessages(library(rhdf5)) +suppressMessages(library(tximport)) +suppressMessages(library(DEXSeq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + +run <- function(opt) { + + ### READING DATA ### + # Read in sampleData Table + futile.logger::flog.info(paste("reading sampleData table from", normalizePath(opt$sampleData))) + sampleData <- read_tsv(opt$sampleData) + sampleData <-sampleData[sampleData$include ==1, ] + futile.logger::flog.info(paste("read sampleData ", paste(dim(sampleData), collapse = ","))) + futile.logger::flog.info(paste(sampleData)) + rownames(sampleData) <- sampleData$track + + futile.logger::flog.info(paste("reading in data from ", opt$source)) + if(opt$source %in% c("salmon", "kallisto")){ + # Read in Transcript to Gene Map + tx2gene <- read_tsv(opt$tx2gene) + colnames(tx2gene) <- c("ensembl_transcript_id", "ensembl_gene_id") + if(opt$tx2gene_regex != "None"){ + tx2gene <- filter(tx2gene, !grepl(opt$tx2gene_regex,ensembl_transcript_id)) + } + # Read in Data + futile.logger::flog.info(opt$counts_dir) + futile.logger::flog.info(sampleData$track) + files <- file.path(opt$counts_dir, sampleData$track, "quant.sf") + names(files) <- sampleData$track + txi <- tximport(files, type = opt$source, tx2gene = tx2gene) + if(opt$method == "deseq2"){ + dataset <- DESeqDataSetFromTximport(txi, colData = sampleData, design = formula(opt$model)) + } else if(opt$method == "edger"){ + cts <- txi$counts + normMat <- txi$length + normMat <- normMat/exp(rowMeans(log(normMat))) + normCts <- cts/normMat + normMat <- sweep(normMat, 2, eff.lib, "*") + normMat <- log(normMat) + dataset <- DGEList(counts=cts, + samples=sampleData) + dataset <- scaleOffset(dataset,normMat) + } else if(opt$method == "Sleuth"){ + stop("Sleuth method not yet implemented. Sorry.") + + } else{ + stop("Method not defined. 
Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"") + } + } + + if(opt$source == "dexseq"){ + # Read in Data + files <- file.path(opt$counts_dir, paste0(sampleData$track, ".txt")) + names(files) <- sampleData$track + if(opt$method != "dexseq"){ + stop("DEXSeq input is handled by diffexonexpression. Please correct the method argument.") + } + dataset = DEXSeqDataSetFromHTSeq( + files, + sampleData=sampleData, + design=formula(opt$model), + flattenedfile=opt$flattenedFile) + } else if(opt$source == "counts_table"){ + # Read in Data + raw <- read.table(file = gzfile(opt$counts_tsv), header=TRUE, row.name=1, check.names = FALSE) + futile.logger::flog.info(paste("Colnames: ", colnames(raw))) + experiment_tsv <- raw[,sampleData$track,drop=FALSE] + if(opt$method == "deseq2"){ + dataset = DESeqDataSetFromMatrix(experiment_tsv, sampleData, design = formula(opt$model)) + } else if(opt$method == "edger"){ + dataset <- DGEList(counts=experiment_tsv, samples=sampleData) + } else if(opt$method == "sleuth"){ + stop("Sleuth method not yet implemented. Sorry.") + } else{ + stop("Method not defined. Allowable methods are \"DESeq2\", \"EdgeR\" or \"Sleuth\"") + } + } + + ### FILTERING ### + if(opt$filter == TRUE) { + futile.logger::flog.info(paste("filtering data ", opt$source)) + if(opt$method == "edger"){ + futile.logger::flog.info(paste("Counts before filtering ", paste(dim(dataset$counts), collapse = ","))) + keep <- filterByExpr(dataset) + dataset <- dataset[keep, , keep.lib.sizes=FALSE] + counts_table <- dataset$counts + futile.logger::flog.info(paste("Counts after filtering ", paste(dim(dataset$counts), collapse = ","))) + } else if(opt$method == "deseq2"){ + futile.logger::flog.info(paste("Counts before filtering ", paste(dim(counts(dataset)), collapse = ","))) + keep <- rowSums(counts(dataset)) >= 10 + dataset <- dataset[keep,] + #dataset <- dataset[Reduce('|', dataset[sapply(dataset, is.numeric)]),] + #dataset <- dataset[rowSums(dataset[,sapply(dataset, is.numeric)]==0)<6,] + #dataset <- subset(dataset, !rowSums(assay(dataset) == 0) > 6) + counts_table <- counts(dataset) + futile.logger::flog.info(paste("Counts after filtering ", paste(dim(counts(dataset)), collapse = ","))) + } else if(opt$method == "dexseq"){ + futile.logger::flog.info(paste("Filtering for DEXSeq not implemented ")) + counts_table <- counts(dataset) + } + } + else { + futile.logger::flog.info(paste("No filtering on dataset performed.", opt$source)) + } + + ### SAVING DATA ### + file = get_output_filename(paste0(opt$outdir,"/experiment_out.rds")) + flog.info(paste("saving experiment data to", file)) + saveRDS(dataset, file = file) + + ## Set up gene lengths for RPKM + flog.info("outputting counts data") + write.table(counts(dataset), + file = paste0(opt$outdir,"/Counts_Data.tsv"), + sep = "\t", + quote = FALSE, + row.names = TRUE, + col.names = NA) + if(opt$source %in% c("salmon", "kallisto")){ + write.table(txi$abundance, + file = paste0(opt$outdir,"/tpm.tsv"), + sep = "\t", + quote = FALSE, + row.names = TRUE, + col.names = NA) + } +} + + +main <- function() { + option_list <- list( + make_option( + "--counts-dir", + dest="counts_dir", + type="character", + help=paste("directory containing expression estimates", + "from salmon/kallisto/DEXSeq.") + ), + make_option( + "--counts-tsv", + dest="counts_tsv", + type="character", + help=paste("file containing counts generated", + "by e.g. 
featurecounts.") + ), + make_option( + c("-d", "--sampleData"), + dest="sampleData", + type="character", + default = "", + help=paste("input file with experimental design/sample info") + ), + make_option( + c("--outdir"), + dest="outdir", + type="character", + default = "results.dir", + help=paste("output directory") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~group", + help = paste("formula for multivariate model") + ), + make_option( + c("-s", "--source"), + dest="source", + type="character", + default="salmon", + help=paste("Source of data. Possible options are ", + "\"salmon\", \"kallisto\", \"counts_table\", \"dexseq\"") + ), + make_option( + c("--tx2gene"), + dest="tx2gene", + type="character", + default="transcript2geneMap.tsv", + help=paste("Path to transcript to gene tsv.") + ), + make_option( + c("--tx2gene_regex"), + dest="tx2gene_regex", + type="character", + default="None", + help=paste("Regex/Prefix for removal of certain features from ", + "experiment (e.g. removal of spike-ins)") + ), + make_option( + "--method", + dest="method", + type="character", + default="deseq2", + help=paste("differential expression method to apply ") + ), + make_option( + "--filter", + dest="filter", + type="logical", + default=TRUE, + help=paste("adopt filtering strategy. ", + "For EDGER, the default strategy is applied. ", + "For DESeq2 basic rowsum filtering < 10 is applied.") + ), + make_option( + "--dexseq-flattened-file", + dest="flattenedFile", + type="character", + help=paste("directory containing flat gtf for dexseq. DEXSeq ", + "expects this to be generated by the", + "DEXSeq_prepare_annotations.py script") + ) + ) + + opt <- experiment_start(option_list = option_list, + description = description) + if (!is.null(opt$method)) { + opt$method = str_to_lower(opt$method) + } + + if (!is.null(opt$source)) { + opt$source = str_to_lower(opt$source) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/script_template.R b/cgatpipelines/tasks/script_template.R new file mode 100644 index 000000000..84ee66d8d --- /dev/null +++ b/cgatpipelines/tasks/script_template.R @@ -0,0 +1,66 @@ +#' filtering single cell data based on QC metrics +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' cgat sc-counts2counts --counts-filename=featurecounts.tsv --phenotypes-filename=phenodata.tsv --factor=group,mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint > filtered_counts.tsv +#' +#' `feature_counts.tsv` is a table (tab-separated) of ngenes x ncells, +#' that is the genes are in rows and the columns are cells. +#' +#' `phenodata.tsv` is a table (tab-separated) of ncells x nfeatures, +#' that is rows are cells and features are in columns. The table should contain +#' a column called `sample_id` that will match the columns in the table +#' `feature_counts.tsv`. +#' +#' Features can then be selected in the `--factor` option to be +#' plotted. +#' +#' -> todo: parameterize detection of ERCC (pattern?) +#' -> todo: parameterize definition of mitochondrial genes - currently hardcoded for mouse. 
+
+## conda dependencies: bioconductor-scater r-cairo
+
+suppressMessages(library(futile.logger))
+suppressMessages(library(getopt))
+
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R"))
+
+
+run <- function(opt) {
+
+    ## do something
+
+}
+
+main <- function() {
+
+    option_list <- list(
+        make_option(
+            "--counts-filename",
+            dest = "counts_filename",
+            type = "character",
+            default = "featurecounts.tsv",
+            help = paste("filename with input data of counts")
+        ),
+        make_option(
+            "--factor",
+            dest = "factors",
+            type = "character",
+            default = "group,collection_date",
+            help = paste("factors to colour QC plots by.")
+        )
+    )
+    opt <- experiment_start(option_list = option_list,
+                            description = description)
+
+    if (!is.null(opt$factors)) {
+        opt$factors = unlist(strsplit(opt$factors, ","))
+    }
+    run(opt)
+
+    experiment_stop()
+}
+
+main()
diff --git a/cgatpipelines/tasks/venn.R b/cgatpipelines/tasks/venn.R
new file mode 100644
index 000000000..16f3ca19e
--- /dev/null
+++ b/cgatpipelines/tasks/venn.R
@@ -0,0 +1,132 @@
+#' Wrapper for VennDiagram with basic customization and command line functionality
+#'
+#' WARNING: This script is work-in-progress
+#'
+#' Example usage:
+#'
+#' Rscript PATH/TO/venn.R --filenames=resultsA.tsv,resultsB.tsv --names=A,B --colours=blue,red --pvalue=0.05 --lfc=0 --sign=positive
+#'
+#' Each input file is a tab-separated table of differential expression
+#' results with genes as row names and at least `padj` and `log2FoldChange`
+#' columns (as produced e.g. by DESeq2).
+#'
+#' Genes passing the `--pvalue` and `--lfc` thresholds, optionally restricted
+#' to one direction of change via `--sign`, are intersected across the input
+#' files; the overlap is written to `venn.txt` and the diagram to
+#' `venn_diagramm.png`.
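+#'
+#' For reference, calculate.overlap() (used in run() below) returns a named
+#' list of set partitions; a toy sketch of its behaviour for two sets:
+#'
+#'   library(VennDiagram)
+#'   calculate.overlap(list(A = c("g1", "g2", "g3"), B = c("g2", "g3", "g4")))
+#'   # $a1 = set A, $a2 = set B, $a3 = the intersection ("g2", "g3")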
+ +## conda dependencies: bioconductor-scater r-cairo + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(VennDiagram)) + + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +run <- function(opt) { + + dataframes <- lapply(opt$filenames, read.table) + names(dataframes) <- opt$names + x1 <- lapply(dataframes, function(x) subset(x, padj < opt$pvalue)) + x <- lapply(x1, function(x) subset(x, abs(log2FoldChange) >= opt$lfc)) + if(opt$sign == "positive"){ + x <- lapply(x, function(y) subset(y, log2FoldChange >0))} + if(opt$sign == "negative"){ + x <- lapply(x, function(y) subset(y, log2FoldChange <0))} + x <- lapply(x, rownames) + flog.info(str(x)) + lapply(calculate.overlap(x), write, "venn.txt", append=TRUE, ncolumns=1000) + + + venn.diagram(x, + filename = 'venn_diagramm.png', + output = TRUE , + imagetype="png" , + height = 480 , + width = 480 , + resolution = 300, + compression = "lzw", + lwd = 2, + lty = 'blank', + fill = opt$colours, + cex = 0.8, + fontface = "bold", + fontfamily = "sans", + cat.cex = 0.4, + cat.fontface = "bold", + cat.default.pos = "outer", + cat.fontfamily = "sans" + ) +} + +main <- function() { + + option_list <- list( + make_option( + "--filenames", + dest = "filenames", + type = "character", + default = "file1.tsv,file2.tsv", + help = paste("filenames with results data") + ), + make_option( + "--colours", + dest = "colours", + type = "character", + default = "blue,red", + help = paste("Venn Diagram Colours") + ), + make_option( + "--names", + dest = "names", + type = "character", + default = "A,B", + help = paste("Names") + ), + make_option( + "--pvalue", + dest = "pvalue", + type = "numeric", + default = 0.05, + help = paste("p value threshold") + ), + make_option( + "--lfc", + dest = "lfc", + type = "numeric", + default = 0, + help = paste("LFC value threshold") + ), + make_option( + "--sign", + dest = "sign", + type = "character", + default = "", + help = paste("optional - possibilities are positive/negative") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$filenames)) { + opt$filenames = unlist(strsplit(opt$filenames, ",")) + } + if (!is.null(opt$colours)) { + opt$colours = unlist(strsplit(opt$colours, ",")) + } + if (!is.null(opt$names)) { + opt$names = unlist(strsplit(opt$names, ",")) + } + run(opt) + + experiment_stop() +} + +main() From 00dffe9de5c5b21871be009db75747d9ae01cbee Mon Sep 17 00:00:00 2001 From: jscaber Date: Wed, 11 Dec 2024 18:16:11 +0000 Subject: [PATCH 41/43] add further R scripts, moved --- cgatpipelines/tasks/diffexpression.R | 397 +++++++++++++++++++++++++++ cgatpipelines/tasks/exploratory.R | 302 ++++++++++++++++++++ 2 files changed, 699 insertions(+) create mode 100644 cgatpipelines/tasks/diffexpression.R create mode 100755 cgatpipelines/tasks/exploratory.R diff --git a/cgatpipelines/tasks/diffexpression.R b/cgatpipelines/tasks/diffexpression.R new file mode 100644 index 000000000..20fda35ff --- /dev/null +++ b/cgatpipelines/tasks/diffexpression.R @@ -0,0 +1,397 @@ +#' Differential expression analysis +#' +#' WARNING: This script is work-in-progress +#' +#' Example usage: +#' +#' Rscript PATH/TO/diffexpression.R --rds-filename=sce.rds --model=~group --coef=group_MUT_vs_CTR --refgroup=CTR +#' + + +suppressMessages(library(futile.logger)) +suppressMessages(library(getopt)) +suppressMessages(library(fgsea)) +suppressMessages(library(data.table)) +suppressMessages(library(sva)) +suppressMessages(library(biomaRt)) 
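+
+# Note on --coef (a hedged sketch, not executed here): the value passed via
+# --coef must match one of resultsNames() on the fitted DESeq2 object. For
+# the header example (--model=~group, --refgroup=CTR and a MUT group):
+#
+#   dds <- DESeq(experiment)
+#   resultsNames(dds)                  # e.g. "Intercept", "group_MUT_vs_CTR"
+#   res <- results(dds, name = "group_MUT_vs_CTR")
+#   resLFC <- lfcShrink(dds, coef = "group_MUT_vs_CTR", type = "apeglm")
+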
+suppressMessages(library(tidyverse)) +suppressMessages(library(RUVSeq)) +suppressMessages(library(Cairo)) +suppressMessages(library(goseq)) + +source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene", 'chromosome_name', + 'start_position', 'end_position'), + values= values, + mart= mart, + useCache = FALSE) + data$description <- gsub("\t", "", data$description) + return(data) +} + +start_plot <- function(section, height = 10, width = 10, type = "png", outdir="") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + +# makeTPMtable +# Function: combines data from a genelist with abundance data to create a +# Input: 1. genelist vector in ENSEMBL format, 2. abundance matrix from tximport +# Output: data frame with gene symbol names +makeTPMtable <- function(genelist, abundance, design, contrast){ + genelist.df <- getmart(genelist) + genelist.df <- genelist.df[!duplicated(genelist.df[,1]),] + rownames(genelist.df) <- genelist.df$ensembl_gene_id + genelist.names <- genelist.df[genelist,]$external_gene_name + genelist.names[is.na(genelist.names)] <- genelist[is.na(genelist.names)] + dftemp <- as_tibble(t(abundance[genelist,]), rownames = "track") + dftemp <- dftemp %>% rename_at(vars(genelist), ~ genelist.names) + dftemp$contrast <- design[dftemp$track,][,contrast] + return(dftemp) +} + +# plotTPMs function +# Function: plots a table of TPMs from a data frame, ensuring appropriate formating for ggplot +# Input: data frame +# Output: plot object +plotTPMs <- function(dftemp, contrast_name){ + dftemp %>% + gather(key = "var", value="value", -contrast, -track) %>% + mutate(var = factor(var, levels=unique(var))) %>% + ggplot(aes(x = contrast, y = value, color = contrast)) + + geom_point(position = position_jitter(w = 0.15, h = 0)) + + facet_wrap(~ var, scales = "free") + theme_bw() + + ylab("normalised counts") + xlab (contrast_name) + guides(color = "none")+ + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +} + + +run <- function(opt) { + + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + futile.logger::flog.info(paste("Dataset", class(experiment))) + if (class(experiment) == "DESeqDataSet"){ + flog.info("Running DESeq2") + suppressMessages(library(DESeq2)) + colData(experiment)[,opt$contrast] <- relevel(colData(experiment)[,opt$contrast], ref = opt$refgroup) + design(experiment) <- formula(opt$model) + dds <- DESeq(experiment, betaPrior=FALSE) + futile.logger::flog.info(paste("coef", opt$coef)) + futile.logger::flog.info(paste("coef", resultsNames(dds))) + res <- results(dds, name=opt$coef) + resLFC <- lfcShrink(dds, coef=opt$coef, type=opt$shrinkage) + } + if (class(experiment) == "DGEList"){ + dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) + } + if (class(experiment) == "DEXSeqDataSet"){ + stop("please use diffexonexpression R script") + } + + flog.info("... 
plotting dispersion estimates") + ## Plot dispersion estimates + start_plot("Dispersion", outdir=opt$outdir) + plotDispEsts(dds) + end_plot() + + flog.info("... plotting MA") + ## MA Plot + start_plot("MAPlot", outdir=opt$outdir) + DESeq2::plotMA(resLFC, ylim = c(ceiling(max(resLFC$log2FoldChange)),floor(min(resLFC$log2FoldChange)))) + end_plot() + + flog.info("... saving DE data") + ## Save DE data + resSig <- subset(resLFC, padj < opt$alpha) + data <- getmart(rownames(resLFC)) + resLFC$symbol<-data$external_gene_name[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$desc<-data$description[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$chromosome<-data$chromosome_name[match(rownames(resLFC), data$ensembl_gene_id)] + resLFC$start<-data$start_position[match(rownames(resLFC), data$ensembl_gene_id)] + resSig <- subset(resLFC, padj < opt$alpha) + write.table(resSig, paste0(opt$outdir,"/","results.tsv"), sep = "\t") + write.table(resLFC, paste0(opt$outdir,"/","results_full.tsv"), sep = "\t") + resSig2 <- subset(res, padj < opt$alpha) + write.table(resSig2, paste0(opt$outdir,"/","results_noLFCshrinkage.tsv"), sep = "\t") + resdf <- data.frame(geneid=rownames(resLFC), pvalue=resLFC$pvalue*sign(resLFC$log2FoldChange)) + write.table(resdf, paste0(opt$outdir,"/","px_results_pvalue.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange) + write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + resdf <- data.frame(geneid=rownames(resLFC), l2fc=resLFC$log2FoldChange) + write.table(resdf, paste0(opt$outdir,"/","px_results_l2fc.gene.tsv"), sep = "\t", row.names = FALSE, quote = FALSE) + if(any(names(assays(experiment)) == "avgTxLength")){ + genelengths <- rowMeans(assays(experiment)$avgTxLength) + write.table(genelengths, paste0(opt$outdir,"/","gene_lengths.tsv"), sep = "\t", row.names = TRUE, quote = FALSE) + } + file = get_output_filename(paste0(opt$outdir,"/","results_table.rds")) + flog.info(paste("saving results data to", file)) + saveRDS(resLFC, file = file) + + + flog.info("... plotting P histogram") + ## Plot P value Histogram + start_plot("PHistogram", outdir=opt$outdir) + hist(res$pvalue,breaks=50, col='skyblue', xlab="p-value", main="P-value Histogram") + end_plot() + + flog.info("... plotting downregulated genes") + ## Plot Top Downregulated Genes + genelist <- rownames(resLFC[ order( resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,]) + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Downregulated", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + end_plot() + + flog.info("... plotting upregulated genes") + genelist <- rownames(resLFC[ order( -resLFC[,grepl("log2",colnames(resLFC))] ), ][0:9,]) + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Upregulated", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... plotting significant genes") + resSig <- subset(resLFC, padj < 0.05) + genelist <- rownames(resSig[order(resSig$padj), ]) + if(length(genelist) > 9){ + genelist <- genelist[0:9]} + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("significant", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... 
plotting user-defined genes") + genelist <- unlist(strsplit(opt$userlist, ",")) + genelist <- genelist[genelist %in% rownames(dds)] + dftemp <- makeTPMtable(genelist, counts(dds, normalized=TRUE), colData(dds), opt$contrast) + start_plot("Userdefined", outdir=opt$outdir) + print(plotTPMs(dftemp, opt$contrast)) + dev.off() + + flog.info("... performing gene ontology (GO) enrichment analysis") + # Currently only supports data from salmon and kallisto + # Needs implementation of genelength matrix from featurecounts + if(any(names(assays(experiment)) == "avgTxLength")){ + res.nona <- subset(resLFC, (!is.na(padj & baseMean) & baseMean > 1)) + res.list <- list(as.integer(res.nona$padj < 0.05), + as.integer(res.nona$log2FoldChange >0 & res.nona$padj < 0.05), + as.integer(res.nona$log2FoldChange <0 & res.nona$padj < 0.05)) + res.names <- list("all", "up", "down") + for(i in 1:3) { + de.genes <- res.list[[i]] + names(de.genes) <- rownames(res.nona) + temp <- rowMeans(assays(experiment)$avgTxLength[match(names(de.genes), rownames(assays(experiment)$avgTxLength)),]) + pwf=nullp(de.genes,bias.data=temp) + all = goseq(pwf,"hg38","ensGene", method="Hypergeometric") + sigall <- all + names(sigall) <- c("category","pvalue","underrepresented_pvalue","numberDE", "numberTOT", "term", "ontology") + sigall$pvalue <- p.adjust(sigall$pvalue, method="BH") + sigall$percent <- sigall$numberDE/sigall$numberTOT + sigall <- sigall[which(sigall[,2] < 0.05),] + cats <- sigall$category + write.table(sigall[,c("category", "pvalue")], file=paste0(opt$outdir,"/GO_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + write.table(sigall, paste0(opt$outdir,"/GO_annotated_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + write.table(all, paste0(opt$outdir,"/GO_complete_",res.names[[i]],".tsv"), quote=FALSE, sep='\t', row.names = FALSE) + } + } + + + flog.info("... 
performing gene set enrichment analysis (GSEA)")
+    resrnk <- resLFC
+    data <- getmart(rownames(resrnk))
+    resrnk$symbol <- data$external_gene_name[match(rownames(resrnk), data$ensembl_gene_id)]
+    resrnk$desc <- data$description[match(rownames(resrnk), data$ensembl_gene_id)]
+    resrnk$entrezgene <- data$entrezgene[match(rownames(resrnk), data$ensembl_gene_id)]
+    rnk.df <- as_tibble(resrnk[,c("entrezgene","log2FoldChange")]) %>% na.omit()
+    rnk <- rnk.df$log2FoldChange
+    names(rnk) <- rnk.df$entrezgene
+    rnk <- rnk[isUnique(names(rnk))]
+    if (!dir.exists(paste0(opt$outdir,"/gsea"))) {
+        dir.create(paste0(opt$outdir,"/gsea"))
+    }
+
+    for(pathway in opt$pathways){
+        pathways <- gmtPathways(pathway)
+        fgseaRes <- fgsea(pathways = pathways,
+                          stats = rnk,
+                          minSize=15,
+                          maxSize=500,
+                          nperm=10000)
+        fwrite(fgseaRes, file=paste0(opt$outdir,"/gsea/",sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.tsv'), sep="\t", sep2=c("", " ", ""))
+        topPathwaysUp <- fgseaRes[ES > 0,][head(order(pval), n=10),]$pathway
+        png(paste0(opt$outdir,"/gsea/",'UP_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'),
+            width =15, height = 3, units = 'in', res = 600)
+        plotGseaTable(pathways[topPathwaysUp], rnk, fgseaRes,
+                      gseaParam = 0.5, colwidths = c(10,2,1,1,1))
+        dev.off()
+
+        topPathwaysDown <- fgseaRes[ES < 0,][head(order(pval), n=10),]$pathway
+        png(paste0(opt$outdir,"/gsea/",'DOWN_',sub("([^.]+)\\.[[:alnum:]]+$", "\\1", (basename(pathway))),'.png'),
+            width =15, height = 3, units = 'in', res = 600)
+        plotGseaTable(pathways[topPathwaysDown], rnk, fgseaRes,
+                      gseaParam = 0.5, colwidths = c(10,2,1,1,1))
+        dev.off()
+    }
+
+    if(opt$perm > 0){
+        flog.info("... performing permutations")
+        designperm <- colData(dds)
+        ddsperm = dds
+        x = 0
+        i = 1
+        y = list()
+        while(i <= opt$perm) {
+            flog.info(paste0("...... Permutation ", i))
+            # Code to only shuffle the group labels for groups in coef
+            tempdesign <- designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_")))[c(2,4)], collapse = "|"), designperm[, opt$contrast]),]
+            designperm[grep(paste(as.list(unlist(strsplit(opt$coef, "_")))[c(2,4)], collapse = "|"), designperm[, opt$contrast]),][,opt$contrast] = sample(tempdesign[,opt$contrast])
+            if(sum(designperm$group == levels(designperm[, opt$contrast])[1]) != table(colData(dds)[,opt$contrast])[1]){
+                flog.info(paste0("......... Failed. Skipping design: ", paste(as.character(designperm$group), collapse = ",")))
+                flog.info(paste0("......... Number of reference replicates is: ", sum(designperm$group == levels(designperm[, opt$contrast])[1]), ", but should be: ", table(colData(dds)[,opt$contrast])[1]))
+                next
+            }
+            colData(ddsperm) <- designperm
+            ddsperm <- DESeq(ddsperm)
+            resperm <- results(ddsperm)
+            x[i] = length(subset(resperm, padj < opt$alpha)$padj)
+            y[[i]] = ddsperm$group
+            i = i+1
+        }
+        start_plot("Simulations", outdir=opt$outdir)
+        theme_set(theme_gray(base_size = 18))
+        sims = qplot(x,
+                     geom="histogram",
+                     breaks=seq(0, 20, by = 1), fill=I("grey"), col=I("black"),
+                     main = "Histogram of DE experiments\n with random group labels",
+                     xlab = "Number of differentially expressed genes",
+                     ylab = "Number of simulations") +
+            geom_vline(xintercept = length(rownames(subset(res, padj < opt$alpha)))) +
+            theme_classic() + theme(plot.title = element_text(hjust = 0.5, size=22))
+        print(sims)
+        end_plot()
+        flog.info(paste0("... 
Permutation p value: ", + length(x[x > length(rownames(subset(res, padj < opt$alpha)))])/length(x))) + z <- list() + for(i in 0:length(x)){ + z[i] <- paste( unlist(y[i]), collapse=' ') + } + z <- unlist(z) + df <- data.frame(number=x, combination=z) + df + write_tsv(df,paste0(opt$outdir,"/","Simulations.tsv")) + } +} + +main <- function() { + + option_list <- list( + make_option( + "--rds-filename", + dest = "rds_filename", + type = "character", + default = "experiment.rds", + help = paste("filename with input data of counts") + ), + make_option( + "--model", + dest = "model", + type = "character", + default = "~group", + help = paste("model formula for GLM") + ), + make_option( + "--contrast", + dest = "contrast", + type = "character", + default = "group", + help = paste("contrast/factor to use for comparison") + ), + make_option( + "--coef", + dest = "coef", + type = "character", + default = "", + help = paste("Comparison of interest for DESeq2: e.g. group_MUT_vs_CTR, where MUT and CTR are in group") + ), + make_option( + "--refgroup", + dest = "refgroup", + type = "character", + default = "CTR", + help = paste("reference group, e.g. CTR") + ), + make_option( + "--alpha", + dest = "alpha", + type = "numeric", + default = 0.05, + help = paste("Adjusted P value threshold") + ), + make_option( + "--shrinkage", + dest = "shrinkage", + type = "character", + default = "apeglm", + help = paste("Method for LFC shrinkage. Options are apeglm (default), ashr, normal") + ), + make_option( + "--userlist", + dest = "userlist", + type = "character", + default = "ENSG00000205927,ENSG00000130675,ENSG00000016082,ENSG00000070748,ENSG00000064300,ENSG00000078018", + help = paste("User defined genes to plot") + ), + make_option( + "--outdir", + dest = "outdir", + type = "character", + default = "experiment", + help = paste("Directory for output") + ), + make_option( + "--permute", + dest = "perm", + type = "integer", + default = 0, + help = paste("number of permutations") + ), + make_option( + "--pathways", + dest = "pathways", + type = "character", + default = "", + help = paste("Libraries for gsea") + ) + ) + opt <- experiment_start(option_list = option_list, + description = description) + + if (!is.null(opt$pathways)) { + opt$pathways = unlist(strsplit(opt$pathways, ",")) + } + run(opt) + + experiment_stop() +} + +main() diff --git a/cgatpipelines/tasks/exploratory.R b/cgatpipelines/tasks/exploratory.R new file mode 100755 index 000000000..d22da0cbd --- /dev/null +++ b/cgatpipelines/tasks/exploratory.R @@ -0,0 +1,302 @@ +#' Differential expression analysis Script +#' +#' +#' Example usage: +#' +#' cgatflow R exploratory --model=~ group --contrast=group --factor=mouse_id,collection_date,slice_depth,slice_number,pipette_visual,timepoint +#' +#' Primary Input: `filename.rds` - output from the readandfiltercounts.R file or DESeqExpermient object or DEGList object +#' Additional inputs: model and contrast as well as factors of interest +#' +#' Output: png images of: PCA, clustering, Surrogate Variable Analysis, Heatmaps +#' + +suppressMessages(library(getopt)) +suppressMessages(library(tidyverse)) +suppressMessages(library(data.table)) +suppressMessages(library(DESeq2)) +suppressMessages(library(edgeR)) +suppressMessages(library(rhdf5)) +suppressMessages(library(sva)) +suppressMessages(library(ggplot2)) +suppressMessages(library(biomaRt)) +suppressMessages(library(Cairo)) +suppressMessages(library(pheatmap)) +suppressMessages(library(RColorBrewer)) +suppressMessages(library(ggforce)) + 
+source(file.path(Sys.getenv("R_ROOT"), "experiment.R")) + + +# getmart +# Function: wrapper for getBM for ENSEMBL gene list +# Dependencies: biomaRt package +# Input: vector of ensembl gene ids +# Output: data frame with description, symbol and entrez gene +mart = useMart(biomart = "ENSEMBL_MART_ENSEMBL",dataset="hsapiens_gene_ensembl", host = "jul2018.archive.ensembl.org") +getmart <- function(values){ + data<- getBM( + filters= "ensembl_gene_id", + attributes= c("ensembl_gene_id", "external_gene_name", "description","entrezgene"), + values= values, + mart= mart, + useCache = FALSE) + data$description <- gsub("\t", "", data$description) + return(data) +} + + +start_plot <- function(section, height = 6, width = 6, type = "png", outdir="") { + file = get_output_filename(paste0(outdir,"/",section, ".", type)) + Cairo(file = file, + type = type, + width = width, + height = height, + units="in", + dpi = 300, + bg = "white") + #opar <- par(lwd=0.5) +} + +end_plot <- function() { + dev.off() +} + + +run <- function(opt) { + + ### READING DATA ### + futile.logger::flog.info(paste("reading Experiment object from", normalizePath(opt$rds_filename))) + experiment <- readRDS(opt$rds_filename) + if (class(experiment) == "DESeqDataSet"){ + dds <- experiment + } else if (class(experiment) == "DGEList"){ + dds = DESeqDataSetFromMatrix(experiment$counts, experiment$sample, design = formula(opt$model)) + } + futile.logger::flog.info(paste("reading Experiment object", paste(dim(counts(experiment)), collapse = ","))) + + ### TRANSFORMATION OF DATA ### + futile.logger::flog.info(paste("Transforming data")) + rld<- rlog(dds) + vsd<- vst(dds) + df <- bind_rows( + as_tibble(log2(counts(dds, normalized=TRUE)[, 1:2]+1)) %>% + mutate(transformation = "log2(x + 1)"), + as_tibble(assay(vsd)[, 1:2]) %>% mutate(transformation = "vst"), + as_tibble(assay(rld)[, 1:2]) %>% mutate(transformation = "rlog")) + colnames(df)[1:2] <- c("x", "y") + start_plot('Variance_Transformations', outdir=opt$outdir) + print(ggplot(df, aes(x = x, y = y)) + geom_hex(bins = 80) + + coord_fixed() + facet_grid( . 
~ transformation))
+    end_plot()
+
+    ### PRINCIPAL COMPONENT ANALYSIS ###
+    futile.logger::flog.info(paste("Performing Principal Component Analysis"))
+    pca = prcomp(t(assay(vsd)))
+    for(factor in opt$factors){
+        variable.group <- colData(dds)[, factor]
+        names(variable.group) <- factor
+        percentVar <- round(100 * summary(pca)$importance[2,])
+        futile.logger::flog.info(paste("number of PCA dimensions", ncol(pca$x)))
+        if(ncol(pca$x) > 7){
+            dim_pca = 8
+        } else {
+            dim_pca = ncol(pca$x)
+        }
+        scores <- data.frame(variable.group, pca$x[,1:dim_pca])
+        start_plot(paste0('PCA_', factor), outdir=opt$outdir)
+        print(ggplot(scores, aes(x=PC1, y=PC2, colour=variable.group)) +
+              theme(legend.position="right") +
+              labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
+                   y=paste0("PC2 (", percentVar[2],"% of variance)")) +
+              ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) +
+              theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) +
+              theme(text=element_text(family='serif')))
+        end_plot()
+        start_plot(paste0('PCA_13_', factor), outdir=opt$outdir)
+        print(ggplot(scores, aes(x=PC1, y=PC3, colour=variable.group)) +
+              theme(legend.position="right") +
+              labs(colour=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"),
+                   y=paste0("PC3 (", percentVar[3],"% of variance)")) +
+              ggtitle("Principal Component Analysis") + theme_grey(base_size = 15) +
+              theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) +
+              theme(text=element_text(family='serif')))
+        end_plot()
+    }
+    variable.group <- colData(dds)[, opt$contrast]
+    names(variable.group) <- opt$contrast
+    scores <- data.frame(variable.group, pca$x[,1:dim_pca])
+    start_plot(paste0('PCA_grid'), outdir=opt$outdir)
+    if(dim_pca == 8){
+        print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) +
+              geom_point(shape = 16, size = 0.5, position = 'auto') +
+              geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') +
+              facet_matrix(vars(PC1:PC8), layer.diag = 2))
+    }else{
+        print(ggplot(scores, aes(x = .panel_x, y = .panel_y, fill = variable.group, colour = variable.group)) +
+              geom_point(shape = 16, size = 0.5, position = 'auto') +
+              geom_autodensity(alpha = 0.3, colour = NA, position = 'identity') +
+              facet_matrix(vars(everything()), layer.diag = 2))
+    }
+    end_plot()
+
+    loadings <- pca$rotation[,1:dim_pca]
+    loadings <- data.frame(loadings[order(loadings[,1]), ])
+    data <- getmart(rownames(loadings))
+    loadings$symbol <- data$external_gene_name[match(rownames(loadings), data$ensembl_gene_id)]
+    loadings$description <- data$description[match(rownames(loadings), data$ensembl_gene_id)]
+    write.table(loadings, file=paste0(opt$outdir,"/",'PCA_loadings.tsv'), quote=FALSE, sep='\t', row.names = TRUE, col.names = NA)
+
+    ### HEATMAPS ###
+    futile.logger::flog.info(paste("Performing Heatmap"))
+    df <- as.data.frame(colData(dds)[,opt$factors])
+    rownames(df) <- colData(dds)$track
+    # Heatmap of Top 20 Expressed Genes
+    select <- order(rowMeans(counts(dds, normalized=TRUE)), decreasing=TRUE)[1:20]
+    start_plot('Heatmap_topExpressed', outdir=opt$outdir)
+    pheatmap(assay(vsd)[select,], cluster_rows=FALSE, cluster_cols=FALSE, show_rownames=FALSE, annotation_col=df)
+    end_plot()
+    # Heatmap of Top 20 Variable Genes
+    topVarGenes <- head(order(rowVars(assay(vsd)), decreasing=TRUE), 20)
+    mat <- assay(vsd)[ topVarGenes, ]
+    temp <- getmart(rownames(mat))
+    row.names(temp) <- temp$ensembl_gene_id
+    rownames(mat) <- 
temp[rownames(mat),"external_gene_name"] + start_plot('Heatmap_topVariable', outdir=opt$outdir) + pheatmap(mat, annotation_col=df, cluster_rows=FALSE,fontsize_row = 6) + end_plot() + # Heatmap of Genes of interest + if (!is.null(opt$genes_of_interest)) { + print(rownames(assay(vsd))) + goi <- opt$genes_of_interest + goi <- goi[goi %in% rownames(assay(vsd))] + mat <- assay(vsd)[goi, ] + mat <- mat - rowMeans(mat) + temp <- getmart(rownames(mat)) + row.names(temp) <- temp$ensembl_gene_id + rownames(mat) <- temp[rownames(mat),"external_gene_name"] + start_plot('Heatmap_ofInterest', outdir=opt$outdir) + pheatmap(mat, annotation_col=df, scale = "row", cluster_cols = FALSE) + end_plot() + } + + ### CLUSTERING ### + futile.logger::flog.info(paste("Clustering")) + # for all genes + sampleDists <- dist(t(assay(vsd))) + sampleDistMatrix <- as.matrix( sampleDists ) + rownames(sampleDistMatrix) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " ) + colnames(sampleDistMatrix) <- NULL + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + start_plot('Heatmap_all', outdir=opt$outdir) + pheatmap(sampleDistMatrix, + clustering_distance_rows = sampleDists, + clustering_distance_cols = sampleDists, + col = colors) + end_plot() + # for top 500 genes + top500 <- head(assay(vsd)[ order(rowMeans(assay(vsd)), decreasing=TRUE ), ], n=500) + distsRL500 <- dist(t(top500)) + sampleDistMatrix500 <- as.matrix( distsRL500 ) + rownames(sampleDistMatrix500) <- paste(colData(vsd)[,opt$contrast], vsd$track, sep = " - " ) + colnames(sampleDistMatrix500) <- NULL + colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255) + start_plot('Heatmap_top500', outdir=opt$outdir) + pheatmap(sampleDistMatrix500, + clustering_distance_rows = distsRL500, + clustering_distance_cols = distsRL500, + col = colors) + end_plot() + + ### EXPLORE BATCH EFFECTS ### + futile.logger::flog.info(paste("Exploring Batch Effects")) + futile.logger::flog.info(dim_pca) + if(dim_pca == 8){ + for(factor in opt$factors){ + factor_transformed <- vsd + assay(factor_transformed) <- limma::removeBatchEffect(assay(factor_transformed), colData(factor_transformed)[,factor]) + pca = prcomp(t(assay(factor_transformed))) + sample.group <- as_factor(colData(dds)[, factor]) + variable.group <- colData(dds)[, opt$contrast] + percentVar <- round(100 * summary(pca)$importance[2,]) + scores <- data.frame(variable.group, sample.group, pca$x[,1:2]) + start_plot(paste0('PCA_', factor, '_removed'), outdir=opt$outdir) + print(ggplot(scores,aes(x=PC1, y=PC2)) + + geom_point(colour=factor(variable.group), shape=factor(sample.group)) + + theme(legend.position="right") + + labs(colour=opt$contrast, shape=factor, x=paste0("PC1 (", percentVar[1],"% of variance)"), + y=paste0("PC2 (", percentVar[2],"% of variance)")) + + ggtitle(paste0("Principal Component Analysis\n after batch correction for ", factor)) + + theme_grey(base_size = 15) + + theme(plot.title = element_text(lineheight=1, face="bold")) + geom_point(size=2) + + theme(text=element_text(family='serif'))) + end_plot() + } + } + else { + futile.logger::flog.info(paste("... 
not done - too few degrees of freedom"))
+    }
+}
+
+main <- function() {
+    option_list <- list(
+        make_option(
+            "--rds-filename",
+            dest = "rds_filename",
+            type = "character",
+            default = "experiment.rds",
+            help = paste("filename with input data of counts")
+        ),
+        make_option(
+            "--model",
+            dest = "model",
+            type = "character",
+            default = "~ group",
+            help = paste("formula for multivariate model")
+        ),
+        make_option(
+            "--contrast",
+            dest = "contrast",
+            type = "character",
+            default = "group",
+            help = paste("contrast/factor to use for comparison")
+        ),
+        make_option(
+            "--factors",
+            dest = "factors",
+            type = "character",
+            default = "",
+            help = paste("comma-separated factors to colour plots by")
+        ),
+        make_option(
+            "--genes_of_interest",
+            dest = "genes_of_interest",
+            type = "character",
+            default = NULL,
+            help = paste("genes of interest for plotting")
+        ),
+        make_option(
+            c("-o","--outdir"),
+            dest = "outdir",
+            type = "character",
+            default = "",
+            help = paste("output directory")
+        )
+    )
+    opt <- experiment_start(option_list = option_list,
+                            description = description)
+
+    if (!is.null(opt$factors)) {
+        opt$factors = unlist(strsplit(opt$factors, ","))
+    }
+    if (!is.null(opt$genes_of_interest)) {
+        opt$genes_of_interest = unlist(strsplit(opt$genes_of_interest, ","))
+    }
+    run(opt)
+
+    experiment_stop()
+}
+
+main()
From 563dd06bc0efb3dea2649c1f48d3d3c80db78161 Mon Sep 17 00:00:00 2001
From: Jakub Scaber
Date: Tue, 17 Dec 2024 20:09:14 +0000
Subject: [PATCH 42/43] Changes to make GTFsubset compatible with sqlalchemy
 changes

---
 cgatpipelines/tasks/gtfsubset.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/cgatpipelines/tasks/gtfsubset.py b/cgatpipelines/tasks/gtfsubset.py
index 62c467fc8..cd9c1c0fd 100644
--- a/cgatpipelines/tasks/gtfsubset.py
+++ b/cgatpipelines/tasks/gtfsubset.py
@@ -14,6 +14,7 @@
 import cgat.GTF as GTF
 import cgatcore.pipeline as P
 import cgatcore.database as Database
+from sqlalchemy import text
 
 class SubsetGTF():
 
@@ -142,10 +143,12 @@ def getRepeatDataFromUCSC(dbhandle,
     expression given. 
''' - cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'") - tables = [x[0] for x in cc.fetchall()] - if len(tables) == 0: - raise ValueError("could not find any `rmsk` tables") + with dbhandle.connect() as conn: + query = "SHOW TABLES LIKE '%%rmsk'" + cc = conn.execute(text(query)) + tables = [x[0] for x in cc.fetchall()] + if len(tables) == 0: + raise ValueError("could not find any `rmsk` tables") # now collect repeats tmpfile = P.get_temp_file(".") @@ -166,9 +169,10 @@ def getRepeatDataFromUCSC(dbhandle, sql = sql % locals() E.debug("executing sql statement: %s" % sql) - cc = dbhandle.execute(sql) - for data in cc.fetchall(): - tmpfile.write("\t".join(map(str, data)) + "\n") + with dbhandle.connect() as conn: + cc = conn.execute(text(sql)) + for data in cc.fetchall(): + tmpfile.write("\t".join(map(str, data)) + "\n") tmpfile.close() From d960eedd18eb40b635ccec178c8453320d563f3f Mon Sep 17 00:00:00 2001 From: Jakub Scaber Date: Tue, 17 Dec 2024 20:42:41 +0000 Subject: [PATCH 43/43] remove strand specificity which has been removed from cgat apps --- cgatpipelines/tools/pipeline_bamstats.py | 27 +----------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/cgatpipelines/tools/pipeline_bamstats.py b/cgatpipelines/tools/pipeline_bamstats.py index 2f4b802ce..f2d12771a 100644 --- a/cgatpipelines/tools/pipeline_bamstats.py +++ b/cgatpipelines/tools/pipeline_bamstats.py @@ -291,22 +291,6 @@ def buildRrnaIntervals(infile, outfile): ######################################################################### -@follows(mkdir("StrandSpec.dir")) -@transform("*.bam", - suffix(".bam"), - r"StrandSpec.dir/\1.strand") -def strandSpecificity(infile, outfile): - '''This function will determine the strand specificity of your library - from the bam file''' - - statement = ( - "cgat bam2libtype " - "--max-iterations 10000 " - "< {infile} " - "> {outfile}".format(**locals())) - return P.run(statement) - - @follows(mkdir("BamFiles.dir")) @transform("*.bam", regex("(.*).bam$"), @@ -824,14 +808,6 @@ def loadTranscriptProfile(infiles, outfile): bamstats.loadTranscriptProfile(infiles, outfile) -@P.add_doc(bamstats.loadStrandSpecificity) -@jobs_limit(PARAMS.get("jobs_limit_db", 1), "db") -@follows(loadTranscriptProfile) -@merge(strandSpecificity, "strand_spec.load") -def loadStrandSpecificity(infiles, outfile): - ''' merge strand specificity data into a single table''' - bamstats.loadStrandSpecificity(infiles, outfile) - @merge((loadBAMStats, loadPicardStats, loadContextStats), "view_mapping.load") def createViewMapping(infile, outfile): @@ -876,8 +852,7 @@ def views(): loadIdxStats, loadExonValidation, loadPicardRnaSeqMetrics, - loadTranscriptProfile, - loadStrandSpecificity) + loadTranscriptProfile) def full(): '''a dummy task to run all tasks in the pipeline''' pass