Skip to content

Commit

Permalink
Merge pull request #31 from NEleanor/master
Browse files Browse the repository at this point in the history
added current moon_tools
  • Loading branch information
crutching authored Jun 26, 2019
2 parents 2eda0fa + 691561a commit 4aa39c9
Show file tree
Hide file tree
Showing 28 changed files with 2,613 additions and 0 deletions.
594 changes: 594 additions & 0 deletions moon_tools/Moon_filter_galaxy.py

Large diffs are not rendered by default.

59 changes: 59 additions & 0 deletions moon_tools/de_dup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
""" de_dup.py
Script that takes two text files with lists of genes and returns a text file with a list of the genes from the medical list with duplicates removed
Eleanor Campbell
"""

import sys

def build_list(file_name):

file = open(file_name, 'r')

my_list = []

for line in file:
entry = line.strip()
entry = entry.strip(' ').upper()
my_list.append(entry)

my_list.sort()

file.close()

return my_list

def de_dup(med_list, hpo_list):

new_list = []

for gene in med_list:
if gene not in hpo_list:
new_list.append(gene)

return new_list


def write_list(my_list, new_name):

file = open(new_name, 'w')

for entry in my_list:
file.write(entry + '\n')

file.close()


def main(med_genes, hpo_genes):
med_gene_list = build_list(med_genes)
hpo_gene_list = build_list(hpo_genes)
de_duped_list = de_dup(med_gene_list, hpo_gene_list)

new_name = 'output.txt'
#hpo_genes.split('.')[0] + '_de_duped.txt'

write_list(de_duped_list, new_name)

if __name__ == "__main__":
args = sys.argv
main(args[1], args[2])

21 changes: 21 additions & 0 deletions moon_tools/de_dup.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<tool id="de_dup_hpo_list" name="DeDup HPO Gene List" version="0.0.1">

<description>Removes all the HPO genes from the medical exome list </description>

<requirements></requirements>

<command detect_errors="exit_code"><![CDATA[
python $__tool_directory__/de_dup.py "$med_list" "$hpo_list"
]]></command>

<inputs>
<param type='data' name='hpo_list' label='HPO Gene List' help='List of genes derived from the HPO terms.'/>
<param type='data' name='med_list' label='Medical Exome Gene List' help='List of genes on the medical exome list.'/>
</inputs>
<outputs>
<data format='txt' name='de_duped_list' from_work_dir='output.txt' help='List of genes from the medical exome NOT in the HPO gene list.'/>
</outputs>
<tests></tests>
<help>Takes the medical exome and removes all genes which have already been identified in the HPO gene list.</help>
<citations></citations>
</tool>
58 changes: 58 additions & 0 deletions moon_tools/exomiser/application.properties
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#
# The Exomiser - A tool to annotate and prioritize genomic variants
#
# Copyright (c) 2016-2018 Queen Mary University of London.
# Copyright (c) 2012-2016 Charit� Universit�tsmedizin Berlin and Genome Research Ltd.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#

#root path where data is to be downloaded and worked on
#it is assumed that all the files required by exomiser listed in this properties file
#will be found in the data directory unless specifically overridden here.
exomiser.data-directory=/place_holder_directory/data/
### hg19 assembly ###
exomiser.hg19.data-version=1805
#transcript source will default to ensembl. Can define as ucsc/ensembl/refseq
#exomiser.hg19.transcript-source=ensembl
#exomiser.hg19.data-directory=${exomiser.data-directory}/${exomiser.hg19.data-version}_hg19
#location of CADD/REMM Tabix files - you will need these for analysis of non-coding variants.
#CADD can be downloaded from http://cadd.gs.washington.edu/download - v1.3 has been tested.
#REMM can be downloaded from https://charite.github.io/software-remm-score.html
#local frequencies are required to be normalised in the same manner as the input VCF and frequency values must be percentages.
#
#You will require the tsv.gz and tsv.gz.tbi (tabix) file pairs.
#Un-comment and add the full path to the relevant tsv.gz files if you want to enable these.
#exomiser.hg19.cadd-snv-path=${exomiser.hg19.data-directory}/whole_genome_SNVs.tsv.gz
#exomiser.hg19.cadd-in-del-path=${exomiser.hg19.data-directory}/InDels.tsv.gz
#exomiser.hg19.remm-path=${exomiser.hg19.data-directory}/remmData.tsv.gz
#exomiser.hg19.local-frequency-path=${exomiser.hg19.data-directory}/local_frequency_test.tsv.gz
### hg38 assembly ###
# To enable analysis of samples called against the hg38 assembly copy the hg19 above and just replace the hg19 with hg38
#exomiser.hg38.data-version=1805
### phenotypes ###
exomiser.phenotype.data-version=1807
#exomiser.phenotype.data-directory=${exomiser.data-directory}/${exomiser.phenotype.data-version}_phenotype
#String random walk data file
#exomiser.phenotype.random-walk-file-name=/home/users/campbena/rw_string_9_05.mv
#exomiser.phenotype.random-walk-index-file-name=rw_string_9_05_id2index.gz
### caching ###
#If you're running exomiser in batch mode there might be some performance benefit
#if you enable caching. The 'simple' option will continue to store data in memory *without*
#limit - this means for really long-running batch jobs and/or whole genomes you may run out of memory.
#If this is likely choose the caffeine option and uncomment spring.cache.caffeine.spec and adjust the cache size
#to your requirements
#none/simple/caffeine
#spring.cache.type=none
#spring.cache.caffeine.spec=maximumSize=60000
153 changes: 153 additions & 0 deletions moon_tools/exomiser/exomiser.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
<tool id="exomiser11" name="Exomiser v11.0.0" version="11.0.0.0" >
<description>A Tool to Annotate and Prioritize Exome Variants</description>
<requirements>
<requirement type="package" version="4.0.5.1">gatk4</requirement>
</requirements>
<command detect_errors="exit_code"><![CDATA[
echo "starting" >> $log &&
export JAVA8_PATH="\$JAVA8_PATH" &&
java -version >> $log &&
ln -s $proband index_needed.vcf &&
gatk IndexFeatureFile -F index_needed.vcf 2>> $log &&
gatk RenameSampleInVcf -I index_needed.vcf -O proband.vcf --NEW_SAMPLE_NAME proband 2>> $log &&
#if $trio.trio_on:
gatk RenameSampleInVcf -I $trio.mother -O mother.vcf --NEW_SAMPLE_NAME mother 2>> $log &&
gatk RenameSampleInVcf -I $trio.father -O father.vcf --NEW_SAMPLE_NAME father 2>> $log &&
gatk CombineGVCFs -O trio.vcf -V proband.vcf -V mother.vcf -V father.vcf
#if $trio.reference_source.reference_source_selector != "no_ref"
#if $trio.reference_source.reference_source_selector != "history"
--reference ${trio.reference_source.reference_sequence.fields.path}
#else
--reference ${trio.reference_source.reference_sequence}
#end if
#end if
2>> $log &&
ln -s $__tool_directory__/${trio.gender}.ped ${trio.gender}.ped &&
#end if
#if $hpo_selector.source == "moon":
ln -s $__tool_directory__/moon_api.py moon_api.py &&
$__tool_directory__/hpo_from_moon_python.sh $hpo_selector.moon_id 2>> $log &&
#else if $hpo_selector.source == "text_input":
echo '$hpo_selector.hpo_terms' > hpo_pre.txt &&
sed "s/__sq__/'/g" hpo_pre.txt > hpo.txt &&
#else:
ln -s $hpo_selector.hpo_terms hpo.txt &&
#end if
cat
#if $trio.trio_on:
$__tool_directory__/trio_pre_${trio.gender}.yml
#else:
$__tool_directory__/pre.yml
#end if
hpo.txt
$__tool_directory__/post.yml > analysis.yml &&
\$JAVA8_PATH -Xms2g -Xmx4g -jar $__tool_directory__/exomiser-cli-11.0.0/exomiser-cli-11.0.0.jar --analysis analysis.yml >> $log &&
Rscript $__tool_directory__/exomiser_merger.R output_AR.variants.tsv output_AD.variants.tsv output_XD.variants.tsv output_XR.variants.tsv output_MT.variants.tsv >> $log
]]>

</command>

<inputs>
<param name="proband" type="data" format="vcf" label="VCF of Proband"/>
<conditional name="trio">
<param name="trio_on" type="boolean" checked="false" label="Trio?" help="If running a trio, you must use GVCFs."/>
<when value="true">
<param name="mother" type="data" format="vcf" label="VCF of Mother"/>
<param name="father" type="data" format="vcf" label="VCF of Father"/>
<param name="gender" type="select" label="Gender">
<option value="female">Female</option>
<option value="male">Male</option>
<option value="other">Other</option>
</param>
<conditional name="reference_source">
<param name="reference_source_selector" type="select" label="Choose the source for the reference list">
<option value="cached">Locally cached</option>
<option value="history">History</option>
<option value="no_ref" selected="true">Do not pass</option>
</param>
<when value="cached">
<param name="reference_sequence" type="select" label="Reference" help="Reference sequence file." >
<options from_data_table="bwa_indexes" >
<validator type="no_options" message="A built-in reference genome is not available for the build associated with the selected input file" />
</options>
</param>
</when>
<when value="history">
<param name="reference_sequence" type="data" format="fasta" label="Reference" help="Reference sequence file." />
</when>
</conditional>
</when>
</conditional>
<conditional name="hpo_selector">
<param name="source" type="select" label="Select Source for HPO terms">
<option value="text_input">Input Text</option>
<option value="file_input">Choose File</option>
<option value="moon" selected="true">Query Moon</option>
</param>
<when value="text_input">
<param name="hpo_terms" type="text" label="HPO Terms"/>
</when>
<when value="file_input">
<param name="hpo_terms" type="data" format="txt" label="File with properly formatted HPO terms."/>
</when>
<when value="moon">
<param name="moon_id" type="integer" label="Moon ID" value="0" help="Find the case in Moon, the ID is the number in the URL."/>
<param name="requestor" type="select" label="Requestor" help="Whether to use curl or python to request the information from Moon.">
<option value="curl">curl</option>
<option value="python" selected="true">python</option>
</param>
</when>
</conditional>

</inputs>

<outputs>
<data name="AD_genes" format="tsv" from_work_dir="output_AD.genes.tsv" label="${tool.name} on ${on_string}: AD Genes"/>
<data name="AD_variants" format="tsv" from_work_dir="output_AD.variants.tsv" label="${tool.name} on ${on_string}: AD Variants"/>
<data name="AD_vcf" format="vcf" from_work_dir="output_AD.vcf" label="${tool.name} on ${on_string}: AD VCF"/>
<data name="AR_genes" format="tsv" from_work_dir="output_AR.genes.tsv" label="${tool.name} on ${on_string}: AR Genes"/>
<data name="AR_variants" format="tsv" from_work_dir="output_AR.variants.tsv" label="${tool.name} on ${on_string}: AR Variants"/>
<data name="AR_vcf" format="vcf" from_work_dir="output_AR.vcf" label="${tool.name} on ${on_string}: AR VCF"/>
<data name="MT_genes" format="tsv" from_work_dir="output_MT.genes.tsv" label="${tool.name} on ${on_string}: MT Genes"/>
<data name="MT_variants" format="tsv" from_work_dir="output_MT.variants.tsv" label="${tool.name} on ${on_string}: MT Variants"/>
<data name="MT_vcf" format="vcf" from_work_dir="output_MT.vcf" label="${tool.name} on ${on_string}: MT VCF"/>
<data name="XD_genes" format="tsv" from_work_dir="output_XD.genes.tsv" label="${tool.name} on ${on_string}: XD Genes"/>
<data name="XD_variants" format="tsv" from_work_dir="output_XD.variants.tsv" label="${tool.name} on ${on_string}: XD Variants"/>
<data name="XD_vcf" format="vcf" from_work_dir="output_XD.vcf" label="${tool.name} on ${on_string}: XD VCF"/>
<data name="XR_genes" format="tsv" from_work_dir="output_XR.genes.tsv" label="${tool.name} on ${on_string}: XR Genes"/>
<data name="XR_variants" format="tsv" from_work_dir="output_XR.variants.tsv" label="${tool.name} on ${on_string}: XR Variants"/>
<data name="XR_vcf" format="vcf" from_work_dir="output_XR.vcf" label="${tool.name} on ${on_string}: XR VCF"/>
<data name="html_output" format="html" from_work_dir="output.html" label="${tool.name} on ${on_string}: HTML"/>
<data name="json_output" format="json" from_work_dir="output.json" label="${tool.name} on ${on_string}: JSON"/>
<data name="analysis_file" label="Exomiser YML" from_work_dir="analysis.yml" format="txt"/>
<data name="log" format="txt" label="Exomiser log"/>
<data name="merged_sorted" format="csv" from_work_dir="merged_sorted.csv" label="${tool.name} on ${on_string}: Merged and Sorted"/>
<data name="top_ten" format="csv" from_work_dir="top_ten.csv" label="${tool.name} on ${on_string}: Top Ten"/>
</outputs>

<tests></tests>

<help>
The Exomiser is a Java program that finds potential disease-causing variants from whole-exome or whole-genome sequencing data.

Starting from a VCF file and a set of phenotypes encoded using the Human Phenotype Ontology (HPO) it will annotate, filter and prioritise likely causative variants. The program does this based on user-defined criteria such as a variant's predicted pathogenicity, frequency of occurrence in a population and also how closely the given phenotype matches the known phenotype of diseased genes from human and model organism data.

The functional annotation of variants is handled by Jannovar and uses UCSC KnownGene transcript definitions and hg19 genomic coordinates.

Variants are prioritised according to user-defined criteria on variant frequency, pathogenicity, quality, inheritance pattern, and model organism phenotype data. Predicted pathogenicity data is extracted from the dbNSFP resource. Variant frequency data is taken from the 1000 Genomes, ESP and ExAC datasets. Subsets of these frequency and pathogenicity data can be defined to further tune the analysis. Cross-species phenotype comparisons come from our PhenoDigm tool powered by the OWLTools OWLSim algorithm.

The Exomiser was developed by the Computational Biology and Bioinformatics group at the Institute for Medical Genetics and Human Genetics of the Charité - Universitätsmedizin Berlin, the Mouse Informatics Group at the Sanger Institute and other members of the Monarch initiative.
</help>

<citations>
<citation type="doi">10.1101/gr.160325.113</citation>
</citations>

</tool>

35 changes: 35 additions & 0 deletions moon_tools/exomiser/exomiser_merger.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
### An R script to take in the exomiser output, merge them together and sort.

#get the command line inputs
args <- commandArgs(TRUE)


#loop over the input files
count = 1
for(i in args) {
#if it's the first input file then just create a new data frame
if ( count == 1 ) {
exomiser_data <- read.table(args[count], header = TRUE, sep = '\t', comment.char = "")
count = count + 1
} else {
#if it is not the first input file then create a new data frame and concatenate with the old one
new_data <- setNames(read.table(args[count], header = TRUE, sep = '\t', comment.char = ""), names(exomiser_data))
exomiser_data <- rbind(exomiser_data, new_data)
count = count + 1
}
}


print(names(exomiser_data))
#sort the data by EXOMISER_GENE_COMBINED_SCORE
exomiser_data <- exomiser_data[order(-exomiser_data$EXOMISER_GENE_COMBINED_SCORE),]
#get just the top ten variants
top_ten <- exomiser_data[c(1:10),]

#write the merged and sorted dataframe to a csv

write.csv(exomiser_data, file = "merged_sorted.csv")

#write the top ten data to a csv

write.csv(top_ten, file = "top_ten.csv")
3 changes: 3 additions & 0 deletions moon_tools/exomiser/female.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
FAM1 father 0 0 1 1
FAM1 mother 0 0 2 1
FAM1 proband father mother 2 2
17 changes: 17 additions & 0 deletions moon_tools/exomiser/hpo_from_moon.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash


HPOS=$(curl -X GET -d "user_token=j4yKzsZivJuCXzoxdUM6" -d "[email protected]" https://oregon.moon.diploid.com/samples/$1/patient-info | grep 'showterm?id=HP:' | sed "s/[<][^>]*[>]//g")

COUNT=1

for LINE in $HPOS; do
if [[ $LINE == *HP:* ]]; then
if [[ $COUNT == 1 ]]; then
COUNT=2
echo -n "'$LINE'" > hpo.txt
else
echo -n ",'$LINE'" >> hpo.txt
fi
fi
done
19 changes: 19 additions & 0 deletions moon_tools/exomiser/hpo_from_moon_python.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash


python moon_api.py -mode info -id $1

HPOS=$(grep 'showterm?id=HP:' info.txt | sed "s/[<][^>]*[>]//g")

COUNT=1

for LINE in $HPOS; do
if [[ $LINE == *HP:* ]]; then
if [[ $COUNT == 1 ]]; then
COUNT=2
echo -n "'$LINE'" > hpo.txt
else
echo -n ",'$LINE'" >> hpo.txt
fi
fi
done
3 changes: 3 additions & 0 deletions moon_tools/exomiser/male.ped
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
FAM1 father 0 0 1 1
FAM1 mother 0 0 2 1
FAM1 proband father mother 1 2
Loading

0 comments on commit 4aa39c9

Please sign in to comment.