Merge pull request #31 from NEleanor/master

added current moon_tools
ohsu-comp-bio · Jun 26, 2019 · 4aa39c9 · 4aa39c9
2 parents 2eda0fa + 691561a
commit 4aa39c9
Show file tree

Hide file tree

Showing 28 changed files with 2,613 additions and 0 deletions.
diff --git a/moon_tools/Moon_filter_galaxy.py b/moon_tools/Moon_filter_galaxy.py
diff --git a/moon_tools/de_dup.py b/moon_tools/de_dup.py
@@ -0,0 +1,59 @@
+""" de_dup.py
+    Script that takes two text files with lists of genes and returns a text file with a list of the genes from the medical list with duplicates removed
+    Eleanor Campbell
+"""
+
+import sys
+
+def build_list(file_name):
+
+    file = open(file_name, 'r')
+
+    my_list = []
+
+    for line in file:
+        entry = line.strip()
+        entry = entry.strip(' ').upper()
+        my_list.append(entry)
+
+    my_list.sort()
+
+    file.close()
+
+    return my_list
+
+def de_dup(med_list, hpo_list):
+
+    new_list = []
+
+    for gene in med_list:
+        if gene not in hpo_list:
+            new_list.append(gene)
+
+    return new_list
+
+
+def write_list(my_list, new_name):
+
+    file = open(new_name, 'w')
+
+    for entry in my_list:
+        file.write(entry + '\n')
+
+    file.close()
+
+
+def main(med_genes, hpo_genes):
+    med_gene_list = build_list(med_genes)
+    hpo_gene_list = build_list(hpo_genes)
+    de_duped_list = de_dup(med_gene_list, hpo_gene_list)
+
+    new_name = 'output.txt'
+    #hpo_genes.split('.')[0] + '_de_duped.txt'
+
+    write_list(de_duped_list, new_name)
+
+if __name__ == "__main__":
+    args = sys.argv
+    main(args[1], args[2])
+
diff --git a/moon_tools/de_dup.xml b/moon_tools/de_dup.xml
@@ -0,0 +1,21 @@
+<tool id="de_dup_hpo_list" name="DeDup HPO Gene List" version="0.0.1">
+
+    <description>Removes all the HPO genes from the medical exome list </description>
+
+    <requirements></requirements>
+
+    <command detect_errors="exit_code"><![CDATA[
+        python $__tool_directory__/de_dup.py "$med_list" "$hpo_list"
+    ]]></command>
+
+    <inputs>
+        <param type='data' name='hpo_list' label='HPO Gene List' help='List of genes derived from the HPO terms.'/>
+        <param type='data' name='med_list' label='Medical Exome Gene List' help='List of genes on the medical exome list.'/>
+    </inputs>
+    <outputs>
+        <data format='txt' name='de_duped_list' from_work_dir='output.txt' help='List of genes from the medical exome NOT in the HPO gene list.'/>
+    </outputs>
+    <tests></tests>
+    <help>Takes the medical exome and removes all genes which have already been identified in the HPO gene list.</help>
+    <citations></citations>
+</tool>
diff --git a/moon_tools/exomiser/application.properties b/moon_tools/exomiser/application.properties
@@ -0,0 +1,58 @@
+#
+# The Exomiser - A tool to annotate and prioritize genomic variants
+#
+# Copyright (c) 2016-2018 Queen Mary University of London.
+# Copyright (c) 2012-2016 Charit� Universit�tsmedizin Berlin and Genome Research Ltd.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+#root path where data is to be downloaded and worked on
+#it is assumed that all the files required by exomiser listed in this properties file
+#will be found in the data directory unless specifically overridden here.
+exomiser.data-directory=/place_holder_directory/data/
+### hg19 assembly ###
+exomiser.hg19.data-version=1805
+#transcript source will default to ensembl. Can define as ucsc/ensembl/refseq
+#exomiser.hg19.transcript-source=ensembl
+#exomiser.hg19.data-directory=${exomiser.data-directory}/${exomiser.hg19.data-version}_hg19
+#location of CADD/REMM Tabix files - you will need these for analysis of non-coding variants.
+#CADD can be downloaded from http://cadd.gs.washington.edu/download - v1.3 has been tested.
+#REMM can be downloaded from https://charite.github.io/software-remm-score.html
+#local frequencies are required to be normalised in the same manner as the input VCF and frequency values must be percentages.
+#
+#You will require the tsv.gz and tsv.gz.tbi (tabix) file pairs.
+#Un-comment and add the full path to the relevant tsv.gz files if you want to enable these.
+#exomiser.hg19.cadd-snv-path=${exomiser.hg19.data-directory}/whole_genome_SNVs.tsv.gz
+#exomiser.hg19.cadd-in-del-path=${exomiser.hg19.data-directory}/InDels.tsv.gz
+#exomiser.hg19.remm-path=${exomiser.hg19.data-directory}/remmData.tsv.gz
+#exomiser.hg19.local-frequency-path=${exomiser.hg19.data-directory}/local_frequency_test.tsv.gz
+### hg38 assembly ###
+# To enable analysis of samples called against the hg38 assembly copy the hg19 above and just replace the hg19 with hg38
+#exomiser.hg38.data-version=1805
+### phenotypes ###
+exomiser.phenotype.data-version=1807
+#exomiser.phenotype.data-directory=${exomiser.data-directory}/${exomiser.phenotype.data-version}_phenotype
+#String random walk data file
+#exomiser.phenotype.random-walk-file-name=/home/users/campbena/rw_string_9_05.mv
+#exomiser.phenotype.random-walk-index-file-name=rw_string_9_05_id2index.gz
+### caching ###
+#If you're running exomiser in batch mode there might be some performance benefit
+#if you enable caching. The 'simple' option will continue to store data in memory *without*
+#limit - this means for really long-running batch jobs and/or whole genomes you may run out of memory.
+#If this is likely choose the caffeine option and uncomment spring.cache.caffeine.spec and adjust the cache size
+#to your requirements
+#none/simple/caffeine
+#spring.cache.type=none
+#spring.cache.caffeine.spec=maximumSize=60000
diff --git a/moon_tools/exomiser/exomiser.xml b/moon_tools/exomiser/exomiser.xml
@@ -0,0 +1,153 @@
+<tool id="exomiser11" name="Exomiser v11.0.0" version="11.0.0.0" >
+  <description>A Tool to Annotate and Prioritize Exome Variants</description>
+  <requirements>
+    <requirement type="package" version="4.0.5.1">gatk4</requirement>
+  </requirements>
+  <command detect_errors="exit_code"><![CDATA[
+  echo "starting" >> $log &&
+  export JAVA8_PATH="\$JAVA8_PATH" &&
+  java -version >> $log &&
+  ln -s $proband index_needed.vcf &&
+  gatk IndexFeatureFile -F index_needed.vcf 2>> $log &&
+  gatk RenameSampleInVcf -I index_needed.vcf -O proband.vcf --NEW_SAMPLE_NAME proband 2>> $log &&
+  #if $trio.trio_on:
+    gatk RenameSampleInVcf -I $trio.mother -O mother.vcf --NEW_SAMPLE_NAME mother 2>> $log &&
+    gatk RenameSampleInVcf -I $trio.father -O father.vcf --NEW_SAMPLE_NAME father 2>> $log &&
+    gatk CombineGVCFs -O trio.vcf -V proband.vcf -V mother.vcf -V father.vcf 
+    #if $trio.reference_source.reference_source_selector != "no_ref"
+            #if $trio.reference_source.reference_source_selector != "history"
+                --reference ${trio.reference_source.reference_sequence.fields.path}
+            #else
+                --reference ${trio.reference_source.reference_sequence}
+            #end if
+    #end if
+    2>> $log &&
+    ln -s $__tool_directory__/${trio.gender}.ped ${trio.gender}.ped &&
+  #end if
+
+  #if $hpo_selector.source == "moon":
+    ln -s $__tool_directory__/moon_api.py moon_api.py &&
+    $__tool_directory__/hpo_from_moon_python.sh $hpo_selector.moon_id 2>> $log &&  
+  #else if $hpo_selector.source == "text_input":
+    echo '$hpo_selector.hpo_terms' > hpo_pre.txt &&
+    sed "s/__sq__/'/g" hpo_pre.txt > hpo.txt &&
+  #else:
+    ln -s $hpo_selector.hpo_terms hpo.txt &&
+  #end if
+
+
+  cat 
+  #if $trio.trio_on:
+    $__tool_directory__/trio_pre_${trio.gender}.yml
+  #else:
+    $__tool_directory__/pre.yml
+  #end if
+  
+  hpo.txt
+
+  $__tool_directory__/post.yml > analysis.yml &&
+
+  \$JAVA8_PATH -Xms2g -Xmx4g -jar $__tool_directory__/exomiser-cli-11.0.0/exomiser-cli-11.0.0.jar --analysis analysis.yml >> $log &&
+  Rscript $__tool_directory__/exomiser_merger.R output_AR.variants.tsv output_AD.variants.tsv output_XD.variants.tsv output_XR.variants.tsv output_MT.variants.tsv >> $log
+]]>
+
+  </command>
+
+  <inputs>
+    <param name="proband" type="data" format="vcf" label="VCF of Proband"/>
+    <conditional name="trio">
+      <param name="trio_on" type="boolean" checked="false" label="Trio?" help="If running a trio, you must use GVCFs."/>
+      <when value="true">
+        <param name="mother" type="data" format="vcf" label="VCF of Mother"/>
+        <param name="father" type="data" format="vcf" label="VCF of Father"/>
+        <param name="gender" type="select" label="Gender">
+          <option value="female">Female</option>
+          <option value="male">Male</option>
+          <option value="other">Other</option>
+        </param>
+        <conditional name="reference_source">
+            <param name="reference_source_selector" type="select" label="Choose the source for the reference list">
+                <option value="cached">Locally cached</option>
+                <option value="history">History</option>
+                <option value="no_ref" selected="true">Do not pass</option>
+            </param>
+            <when value="cached">
+                <param name="reference_sequence" type="select" label="Reference" help="Reference sequence file." >
+                    <options from_data_table="bwa_indexes" >
+                        <validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file" />
+                    </options>
+                </param>
+            </when>
+            <when value="history">
+                <param name="reference_sequence" type="data" format="fasta" label="Reference" help="Reference sequence file." />
+            </when>
+        </conditional>
+      </when>
+    </conditional> 
+    <conditional name="hpo_selector">
+      <param name="source" type="select" label="Select Source for HPO terms">
+        <option value="text_input">Input Text</option>
+        <option value="file_input">Choose File</option>
+        <option value="moon" selected="true">Query Moon</option>
+      </param>
+      <when value="text_input">
+        <param name="hpo_terms" type="text" label="HPO Terms"/>
+      </when>
+      <when value="file_input">
+        <param name="hpo_terms" type="data" format="txt" label="File with properly formatted HPO terms."/>
+      </when>
+      <when value="moon">
+        <param name="moon_id" type="integer" label="Moon ID" value="0" help="Find the case in Moon, the ID is the number in the URL."/>
+	<param name="requestor" type="select" label="Requestor" help="Whether to use curl or python to request the information from Moon.">
+          <option value="curl">curl</option>
+          <option value="python" selected="true">python</option>
+        </param>
+      </when>
+    </conditional>
+
+  </inputs>
+
+  <outputs>
+    <data name="AD_genes" format="tsv" from_work_dir="output_AD.genes.tsv" label="${tool.name} on ${on_string}: AD Genes"/>
+    <data name="AD_variants" format="tsv" from_work_dir="output_AD.variants.tsv" label="${tool.name} on ${on_string}: AD Variants"/>
+    <data name="AD_vcf" format="vcf" from_work_dir="output_AD.vcf" label="${tool.name} on ${on_string}: AD VCF"/>
+    <data name="AR_genes" format="tsv" from_work_dir="output_AR.genes.tsv" label="${tool.name} on ${on_string}: AR Genes"/>
+    <data name="AR_variants" format="tsv" from_work_dir="output_AR.variants.tsv" label="${tool.name} on ${on_string}: AR Variants"/>
+    <data name="AR_vcf" format="vcf" from_work_dir="output_AR.vcf" label="${tool.name} on ${on_string}: AR VCF"/>
+    <data name="MT_genes" format="tsv" from_work_dir="output_MT.genes.tsv" label="${tool.name} on ${on_string}: MT Genes"/>
+    <data name="MT_variants" format="tsv" from_work_dir="output_MT.variants.tsv" label="${tool.name} on ${on_string}: MT Variants"/>
+    <data name="MT_vcf" format="vcf" from_work_dir="output_MT.vcf" label="${tool.name} on ${on_string}: MT VCF"/>
+    <data name="XD_genes" format="tsv" from_work_dir="output_XD.genes.tsv" label="${tool.name} on ${on_string}: XD Genes"/>
+    <data name="XD_variants" format="tsv" from_work_dir="output_XD.variants.tsv" label="${tool.name} on ${on_string}: XD Variants"/>
+    <data name="XD_vcf" format="vcf" from_work_dir="output_XD.vcf" label="${tool.name} on ${on_string}: XD VCF"/>
+    <data name="XR_genes" format="tsv" from_work_dir="output_XR.genes.tsv" label="${tool.name} on ${on_string}: XR Genes"/>
+    <data name="XR_variants" format="tsv" from_work_dir="output_XR.variants.tsv" label="${tool.name} on ${on_string}: XR Variants"/>
+    <data name="XR_vcf" format="vcf" from_work_dir="output_XR.vcf" label="${tool.name} on ${on_string}: XR VCF"/>
+    <data name="html_output" format="html" from_work_dir="output.html" label="${tool.name} on ${on_string}: HTML"/>
+    <data name="json_output" format="json" from_work_dir="output.json" label="${tool.name} on ${on_string}: JSON"/>
+    <data name="analysis_file" label="Exomiser YML" from_work_dir="analysis.yml" format="txt"/>
+    <data name="log" format="txt" label="Exomiser log"/>
+    <data name="merged_sorted" format="csv" from_work_dir="merged_sorted.csv" label="${tool.name} on ${on_string}: Merged and Sorted"/>
+    <data name="top_ten" format="csv" from_work_dir="top_ten.csv" label="${tool.name} on ${on_string}: Top Ten"/>
+  </outputs>
+
+  <tests></tests>
+
+  <help>
+The Exomiser is a Java program that finds potential disease-causing variants from whole-exome or whole-genome sequencing data.
+
+Starting from a VCF file and a set of phenotypes encoded using the Human Phenotype Ontology (HPO) it will annotate, filter and prioritise likely causative variants. The program does this based on user-defined criteria such as a variant's predicted pathogenicity, frequency of occurrence in a population and also how closely the given phenotype matches the known phenotype of diseased genes from human and model organism data.
+
+The functional annotation of variants is handled by Jannovar and uses UCSC KnownGene transcript definitions and hg19 genomic coordinates.
+
+Variants are prioritised according to user-defined criteria on variant frequency, pathogenicity, quality, inheritance pattern, and model organism phenotype data. Predicted pathogenicity data is extracted from the dbNSFP resource. Variant frequency data is taken from the 1000 Genomes, ESP and ExAC datasets. Subsets of these frequency and pathogenicity data can be defined to further tune the analysis. Cross-species phenotype comparisons come from our PhenoDigm tool powered by the OWLTools OWLSim algorithm.
+
+The Exomiser was developed by the Computational Biology and Bioinformatics group at the Institute for Medical Genetics and Human Genetics of the Charité - Universitätsmedizin Berlin, the Mouse Informatics Group at the Sanger Institute and other members of the Monarch initiative.
+  </help>
+
+  <citations>
+    <citation type="doi">10.1101/gr.160325.113</citation>
+  </citations>
+
+</tool>
+
diff --git a/moon_tools/exomiser/exomiser_merger.R b/moon_tools/exomiser/exomiser_merger.R
@@ -0,0 +1,35 @@
+### An R script to take in the exomiser output, merge them together and sort.
+
+#get the command line inputs
+args <- commandArgs(TRUE)
+
+
+#loop over the input files
+count = 1
+for(i in args) {
+	#if it's the first input file then just create a new data frame
+	if ( count == 1 ) {
+		exomiser_data <- read.table(args[count], header = TRUE, sep = '\t', comment.char = "")
+		count = count + 1
+	}	else {
+		#if it is not the first input file then create a new data frame and concatenate with the old one
+		new_data <- setNames(read.table(args[count], header = TRUE, sep = '\t', comment.char = ""), names(exomiser_data))
+		exomiser_data <- rbind(exomiser_data, new_data)
+		count = count + 1
+	}
+}
+
+
+print(names(exomiser_data))
+#sort the data by EXOMISER_GENE_COMBINED_SCORE
+exomiser_data <- exomiser_data[order(-exomiser_data$EXOMISER_GENE_COMBINED_SCORE),]
+#get just the top ten variants
+top_ten <- exomiser_data[c(1:10),]
+
+#write the merged and sorted dataframe to a csv
+
+write.csv(exomiser_data, file = "merged_sorted.csv")
+
+#write the top ten data to a csv
+
+write.csv(top_ten, file = "top_ten.csv")
diff --git a/moon_tools/exomiser/female.ped b/moon_tools/exomiser/female.ped
@@ -0,0 +1,3 @@
+FAM1	father	0	0	1	1
+FAM1	mother	0	0	2	1
+FAM1	proband	father	mother	2	2
diff --git a/moon_tools/exomiser/hpo_from_moon.sh b/moon_tools/exomiser/hpo_from_moon.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+
+HPOS=$(curl -X GET -d "user_token=j4yKzsZivJuCXzoxdUM6" -d "[email protected]" https://oregon.moon.diploid.com/samples/$1/patient-info | grep 'showterm?id=HP:' | sed "s/[<][^>]*[>]//g")
+
+COUNT=1
+
+for LINE in $HPOS; do
+  if [[ $LINE == *HP:* ]]; then
+    if [[ $COUNT == 1 ]]; then
+      COUNT=2
+      echo -n "'$LINE'" > hpo.txt
+    else
+      echo -n ",'$LINE'" >> hpo.txt
+    fi
+  fi
+done
diff --git a/moon_tools/exomiser/hpo_from_moon_python.sh b/moon_tools/exomiser/hpo_from_moon_python.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+
+python moon_api.py -mode info -id $1 
+
+HPOS=$(grep 'showterm?id=HP:' info.txt | sed "s/[<][^>]*[>]//g")
+
+COUNT=1
+
+for LINE in $HPOS; do
+  if [[ $LINE == *HP:* ]]; then
+    if [[ $COUNT == 1 ]]; then
+      COUNT=2
+      echo -n "'$LINE'" > hpo.txt
+    else
+      echo -n ",'$LINE'" >> hpo.txt
+    fi
+  fi
+done
diff --git a/moon_tools/exomiser/male.ped b/moon_tools/exomiser/male.ped
@@ -0,0 +1,3 @@
+FAM1	father	0	0	1	1
+FAM1	mother	0	0	2	1
+FAM1	proband	father	mother	1	2