diff --git a/moon_tools/Moon_filter_galaxy.py b/moon_tools/Moon_filter_galaxy.py
new file mode 100644
index 00000000..34947e08
--- /dev/null
+++ b/moon_tools/Moon_filter_galaxy.py
@@ -0,0 +1,594 @@
+"""
+ Filters the all variants file from MOON
+"""
+
+import sys
+import argparse
+
def supply_args():
    """Parse and return the command-line arguments."""
    parser = argparse.ArgumentParser(description='Filters and populates All Rare Variants file from Moon.')
    # Required input: the all rare variants export from Moon.
    parser.add_argument('variants', help='All Rare Variants file from Moon')
    # Optional gene list used for an additional round of filtering.
    parser.add_argument('-geneList', help='Optional list of genes to further filter on.')
    # Reference data files consumed by the populator step.
    parser.add_argument('-omim', help='OMIM data file.')
    parser.add_argument('-orphConv', help='Orphanet conversion file.')
    parser.add_argument('-orphPrev', help='Orphanet prevelence file.')
    # Flag that disables the filtering stage entirely.
    parser.add_argument('-filterOff', action='store_const', const=True, help='Turns off the filter function.')
    # Name of the column that holds the gene symbols.
    parser.add_argument('-header', default='Gene', help='Header on the gene column of the file.')
    return parser.parse_args()
+
def build_array(tsv_file_name):
    """Read a tab-separated file into a two-tier list.

    Parameters:
        tsv_file_name: path of the TSV file to read.

    Returns:
        A list with one entry per line; each entry is the list of the
        line's tab-separated fields (the trailing newline stays attached
        to the final field, which downstream writers rely on).
    """
    # The context manager guarantees the handle is closed even on error.
    with open(tsv_file_name, 'r') as tsv_file:
        return [line.split('\t') for line in tsv_file]
+
def find_index(header_line, header_name):
    """Return the column index of ``header_name`` in a tab-separated header line.

    Fields are compared after stripping surrounding whitespace.  When the
    header is absent a diagnostic message is printed and -1 is returned.
    """
    # enumerate replaces the original hand-maintained counter.
    for index, entry in enumerate(header_line.split('\t')):
        if entry.strip() == header_name:
            return index
    print('Could not find \"' + header_name + '\" column. Check to make sure your data is labeled correctly, or change the Gene Column Header under Non-Moon Specifications.')
    return -1
+
def filter_by_nums(array, index, max_num):
    """Keep only rows whose value at ``index`` parses to a number <= ``max_num``.

    Rows whose value cannot be parsed as a float are dropped silently.
    """
    kept = []
    for row in array:
        try:
            value = float(row[index])
        except ValueError:
            # Non-numeric cell: the row does not pass the filter.
            continue
        if value <= max_num:
            kept.append(row)
    return kept
+
def filter_by_string(array, index, invalid_string):
    """Return the rows of ``array`` whose value at ``index`` differs from ``invalid_string``."""
    return [row for row in array if row[index] != invalid_string]
+
def add_by_string(array_to, array_from, index, valid_string):
    """Move every row whose value at ``index`` equals ``valid_string``.

    Matching rows are appended to ``array_to`` and removed from
    ``array_from`` in place (callers keep a reference to the same list).

    The previous implementation removed rows while iterating the same
    list, which made the iterator skip the row immediately after every
    match; partitioning into a new list and assigning back avoids that.
    """
    remaining = []
    for entry in array_from:
        if entry[index] == valid_string:
            array_to.append(entry)
        else:
            remaining.append(entry)
    # Slice-assign so the caller's list object is the one mutated.
    array_from[:] = remaining
+
+
+#Gene list filter
def build_list(file_name):
    """Read a gene list from a text file with one gene per line.

    Each line is stripped of surrounding whitespace and upper-cased so
    later membership tests are case-insensitive; blank lines are skipped.

    Returns the sorted list of gene symbols.
    """
    # Context manager closes the file even if a read fails.
    with open(file_name, 'r') as gene_file:
        genes = [line.strip().upper() for line in gene_file]
    # Drop entries that were empty after cleaning, then sort.
    return sorted(gene for gene in genes if gene != '')
+
def new_split(string_list, delimeter):
    """Split already-split entries again on a second delimiter, in place.

    Each entry of ``string_list`` containing ``delimeter`` is removed and
    its split pieces are appended at the end of the list.  The list is
    mutated in place because callers rely on seeing the new entries.

    The previous implementation both removed items while iterating the
    same list and rebound the local name (``string_list = string_list +
    new_genes``) instead of mutating it, so the split pieces never
    reached the caller; iterating a snapshot and using extend fixes both.
    """
    # Iterate a snapshot so removal cannot disturb the iteration.
    for item in list(string_list):
        if delimeter in item:
            string_list.remove(item)
            string_list.extend(item.split(delimeter))
+
+
def gene_in_line(gene_list, line, gene_index):
    """Return True when any gene named on this TSV line is in ``gene_list``.

    Parameters: gene_list - list of upper-case gene symbols,
    line - one raw TSV line, gene_index - column holding the gene names.
    """
    # Upper-case the whole line so the membership test ignores case.
    gene_field = line.upper().split('\t')[gene_index]
    # Gene names may be separated by commas ...
    candidates = gene_field.split(',')
    # ... and by semicolons as well, so split those apart too.
    new_split(candidates, ';')
    for candidate in candidates:
        # Peel off the newline, stray spaces and surrounding quotes.
        cleaned = candidate.strip('\n').strip(' ').strip('"')
        if cleaned in gene_list:
            # One hit is enough to keep the line.
            return True
    # No candidate matched the gene list.
    return False
+
def filter_tsv_by_list(tsv_file_name, gene_list, gene_header = 'Gene'):
    """Write a copy of the TSV keeping only lines whose genes are in ``gene_list``.

    Parameters:
        tsv_file_name: input TSV path.
        gene_list: upper-case gene symbols to keep.
        gene_header: name of the gene column in the header line.

    Returns the name of the filtered file ('RelGenes.tsv').
    """
    filtered_tsv_name = 'RelGenes.tsv'
    # Context managers close both handles even if a line raises.
    with open(tsv_file_name, 'r') as tsv, open(filtered_tsv_name, 'w') as filtered_tsv:
        # The first line is the header; copy it through unchanged.
        first_line = tsv.readline()
        # Locate the gene column by its header name.
        gene_index = find_index(first_line, gene_header)
        filtered_tsv.write(first_line)
        for line in tsv:
            # Keep only lines naming at least one wanted gene.
            if gene_in_line(gene_list, line, gene_index):
                filtered_tsv.write(line)
    return filtered_tsv_name
+
def filter_moon_tsv(moon_tsv):
    """Apply the standard Moon filters and write 'filtered.tsv'.

    Rows are first whitelisted (kept regardless of later filters) when
    Moon did not filter them, ClinVar calls them pathogenic / likely
    pathogenic / VUS, or they are de novo or compound het.  The remaining
    rows are then dropped by gnomAD/diploid frequency, homozygote count,
    missing disorder, and missing coding/splicing effect.  Rows with a
    read depth below 10 are flagged with 'LOW READ DEPTH'.

    Returns the output file name.
    """
    filtered_moon_file_name = 'filtered.tsv'

    # Grab the header line so column indices can be looked up by name.
    with open(moon_tsv, 'r') as moon_file:
        header_line = moon_file.readline()

    moon_array = build_array(moon_tsv)
    # Rows we definitely want to keep, regardless of later filters.
    white_list_array = []

    # Keep every variant Moon itself did not filter out.
    moon_filter_index = find_index(header_line, 'Filter')
    add_by_string(white_list_array, moon_array, moon_filter_index, '')
    # Keep all ClinVar pathogenic and likely pathogenic calls.
    clinVar_index = find_index(header_line, 'ClinVar')
    add_by_string(white_list_array, moon_array, clinVar_index, 'Pathogenic')
    add_by_string(white_list_array, moon_array, clinVar_index, 'Likely pathogenic')
    # Keep all de novo variants.
    de_novo_index = find_index(header_line, 'De Novo')
    add_by_string(white_list_array, moon_array, de_novo_index, 'TRUE')
    # Keep all compound het variants.
    comp_het_index = find_index(header_line, 'Compound Het.')
    add_by_string(white_list_array, moon_array, comp_het_index, 'TRUE')

    # For everything else, drop rows we do not need to review.
    # gnomAD frequency above 1%.
    gnomad_freq_index = find_index(header_line, 'gnomAD frequency')
    moon_array = filter_by_nums(moon_array, gnomad_freq_index, .01)
    # Diploid frequency above 5%.
    diploid_freq_index = find_index(header_line, 'Diploid frequency')
    moon_array = filter_by_nums(moon_array, diploid_freq_index, .05)
    # More than 5 homozygotes in gnomAD.
    homozygotes_index = find_index(header_line, 'gnomAD homs')
    moon_array = filter_by_nums(moon_array, homozygotes_index, 5)
    # No associated disorder.
    disorder_index = find_index(header_line, 'Disorder')
    moon_array = filter_by_string(moon_array, disorder_index, 'No associated disorder')
    # Rescue ClinVar VUS rows, then drop rows with no coding/splicing effect.
    add_by_string(white_list_array, moon_array, clinVar_index, 'VUS')
    effect_index = find_index(header_line, 'Effect')
    moon_array = filter_by_string(moon_array, effect_index, '')

    # Recombine the whitelist with the surviving rows.
    total_array = white_list_array + moon_array
    # Column used to flag low read depth variants.
    reads_index = find_index(header_line, 'Depth')

    with open(filtered_moon_file_name, 'w') as filtered_moon_file:
        filtered_moon_file.write(header_line)
        for entry in total_array:
            # Write all but the last field tab-separated ...
            for value in entry[:-1]:
                filtered_moon_file.write(value + '\t')
            # ... then flag the final field when read depth is under 10.
            value = entry[-1]
            try:
                if float(entry[reads_index]) < 10:
                    value = value.strip('\n') + '\t LOW READ DEPTH \n'
            except ValueError:
                # Non-numeric depth: write the field through unchanged.
                pass
            filtered_moon_file.write(value)

    return filtered_moon_file_name
+
+#Populator
def build_dictionary(data_file_name, search_index, info_index, filter_on = False, filter_index = 32, filter_by = 'OMIM'):
    """Build a {search term: [info, ...]} dictionary from a tab-delimited file.

    Parameters:
        data_file_name: tab-delimited data file to read.
        search_index: column of the term used as the dictionary key.
        info_index: column of the information stored under the key.
        filter_on: when True, only keep info from rows whose value at
            ``filter_index`` equals ``filter_by``.
        filter_index: column checked by the optional filter.
        filter_by: value a row must carry to pass the filter.

    Returns the populated dictionary.  Rows too short to hold both
    columns are reported on stdout; comment lines are ignored.
    """
    data_dictionary = {}
    # Context manager closes the data file even on error.
    with open(data_file_name, 'r') as data_file:
        for line in data_file:
            data_entry = line.split('\t')
            # Only use rows long enough to hold both needed columns.
            if len(data_entry) > max(search_index, info_index):
                search = data_entry[search_index].strip()
                info = data_entry[info_index].strip()
                # Optionally blank the info when the row fails the filter.
                if filter_on and len(data_entry) > filter_index:
                    filter_info = data_entry[filter_index].strip(' ')
                    if filter_info != filter_by:
                        info = ''
                if search in data_dictionary:
                    # Append new, non-empty info values only once.
                    if info not in data_dictionary[search] and info != '':
                        data_dictionary[search].append(info)
                    # Purge any empty placeholder left from earlier rows.
                    if '' in data_dictionary[search]:
                        data_dictionary[search].remove('')
                else:
                    data_dictionary[search] = [info]
            # Skip comment lines (including the BOM-prefixed OMIM copyright line).
            elif data_entry[0].startswith('#') or data_entry[0].startswith('\xef\xbb\xbf# Copyright'):
                pass
            else:
                # Surface malformed rows so the source file can be inspected.
                print("Problem with entry:")
                print(data_entry)
    # Drop the entry keyed by the empty string, if any.
    if '' in data_dictionary:
        del data_dictionary['']
    return data_dictionary
+
def add_info(search_array, info_dictionary, search_index):
    """Prepend dictionary info to each row of a two-tier TSV array, in place.

    For each row, the value at ``search_index`` is looked up in
    ``info_dictionary``.  The last info value is inserted at the front of
    the row; any additional info values become new two-field rows
    inserted directly below it.  Rows without a match (or too short to
    hold the search column) get an empty first column so every row stays
    aligned.  Note: matched dictionary lists are consumed down to a
    single element as a side effect.
    """
    position = 0
    while position < len(search_array):
        row = search_array[position]
        # Rows too short to contain the search column get a dummy key.
        key = row[search_index].strip(' ') if len(row) > search_index else "None"
        if key in info_dictionary:
            info = info_dictionary[key]
            # Spill every info value but the last into its own new row
            # placed immediately below the current one.
            while len(info) > 1:
                search_array.insert(position + 1, [info.pop(0), '\n'])
                # Step past the row we just inserted.
                position += 1
            # The final remaining value is added in-line, up front.
            row.insert(0, info[0])
        else:
            # Keep the columns aligned for rows with no information.
            row.insert(0, '')
        position += 1
+
def write_tsv_from_array(two_tier_array, file_name):
    """Write a two-tier array out as a tab-separated file.

    Parameters:
        two_tier_array: list of rows, each a list of string fields (rows
            are expected to already carry their own newline on the final
            field, as produced by build_array).
        file_name: name of the file to create.

    Returns the file name.

    The previous implementation located each field with ``list.index``,
    which returns the FIRST occurrence and therefore misplaces the
    separator whenever a row contains duplicate values (and made the
    write O(n^2) per row); enumerating the fields fixes both.
    """
    with open(file_name, 'w') as new_tsv:
        for row in two_tier_array:
            last = len(row) - 1
            for position, field in enumerate(row):
                # Tab-separate every field except the last of the row.
                if position < last:
                    new_tsv.write(field + '\t')
                else:
                    new_tsv.write(field)
    return file_name
+
def add_data_from(array_name, data_file_name, data_search_index, tsv_search_index, data_to_add_index, filter_on = False, filter_index = 32, filter_by = 'OMIM'):
    """Merge a column of data from a reference file into a TSV array.

    Parameters:
        array_name: two-tier array that should receive the data.
        data_file_name: tab-delimited file holding the information.
        data_search_index: column of the search term in the data file.
        tsv_search_index: column of the search term in the array.
        data_to_add_index: column of the wanted data in the data file.
        filter_on / filter_index / filter_by: optional row filter passed
            straight through to build_dictionary.

    Returns the (mutated) array.
    """
    # Index the reference file by its search term ...
    lookup = build_dictionary(data_file_name, data_search_index, data_to_add_index, filter_on, filter_index, filter_by)
    # ... then fold the matching values into the array rows.
    add_info(array_name, lookup, tsv_search_index)
    return array_name
+
def clean(tsv_array, out_put_file_name):
    """Label the four populated columns and write the array to a TSV file.

    Returns the name of the file written.
    """
    # Headers for the columns that the populate step prepended.
    headers = ('Inheritance/Disorder: OMIM', 'Prevelance 2016', 'Disorder: Orphanet', 'Orphanum')
    for column, title in enumerate(headers):
        tsv_array[0][column] = title
    # Serialize the relabeled array and hand back the new file's name.
    return write_tsv_from_array(tsv_array, out_put_file_name)
+
def populate(tsv_file_name, OMIM_data, Orphadata_1, Orphadata_2, gene_header = 'Gene'):
    """Populate a TSV file with OMIM and Orphanet annotations.

    Adds, per row: the Orphanet number (from the gene symbol), the
    Orphanet disorder and validated prevalence (from that number), and
    the OMIM inheritance/disorder (from the gene symbol).  Writes the
    result to 'populated.tsv'.
    """
    # CHECK THESE INDICES AND DATA FILES FIRST IN CASE OF NOT WORKING.
    # Column positions inside the Orphanet gene-to-disease file.
    gene_orphanet = 0
    orphanum = 2
    disease_orphanet = 1
    # Column positions inside the Orphanet prevalence file.
    prevelance_orphanet = 27
    validation_orphanet = 36
    orphanum_prev = 5
    # Column positions inside the OMIM genemap2.txt file.
    inheritance_omim = 12
    OMIM_gene = 8

    table = build_array(tsv_file_name)
    # Rebuild the header line so the gene column can be located by name.
    header_line = ''
    for field in table[0]:
        header_line = header_line + field + ('\t')
    gene_tsv = find_index(header_line, gene_header)

    # 1) Gene symbol -> Orphanet number.
    table = add_data_from(table, Orphadata_1, gene_orphanet, gene_tsv, orphanum)
    # Each merge prepends a column, shifting every existing index right.
    orphanum_tsv = 0
    gene_tsv += 1
    # 2) Orphanet number -> disorder name.
    table = add_data_from(table, Orphadata_1, orphanum, orphanum_tsv, disease_orphanet)
    orphanum_tsv += 1
    gene_tsv += 1
    # 3) Orphanet number -> prevalence, validated entries only.
    table = add_data_from(table, Orphadata_2, orphanum_prev, orphanum_tsv, prevelance_orphanet, filter_on = True, filter_index = validation_orphanet, filter_by = 'Validated')
    gene_tsv += 1
    # 4) Gene symbol -> OMIM inheritance/disorder.
    table = add_data_from(table, OMIM_data, OMIM_gene, gene_tsv, inheritance_omim)

    # Label the new columns and write everything out.
    clean(table, 'populated.tsv')
+
+"""
+def filter_manual(moon_tsv):
+ #Seeks user input for filtering a moon file
+ moon_file = open(moon_tsv, 'r')
+ filtered_moon_file_name = moon_tsv.split('.')[0] + '_filtered.tsv'
+ filtered_moon_file = open(filtered_moon_file_name, 'w')
+
+ header_line = moon_file.readline()
+ filtered_moon_file.write(header_line)
+ moon_file.close()
+
+ moon_array = build_array(moon_tsv)
+ #create array of variants we definately want to take a closer look at
+ white_list_array = []
+
+ #initiate a while counter
+ again = True
+ while again:
+ mode = input('Filter type (Add/Remove/Under):')
+ if mode.lower() == 'add':
+ header = input('What is the header on the colunm you want to search by?')
+ valid_string = input('What is the entry type that you want to add?')
+ index = find_index(header_line, header)
+ add_by_string(white_list_array, moon_array, index, valid_string)
+ elif mode.lower() == 'remove':
+ header = input('What is the header on the colunm you want to search by?')
+ invalid_string = input('What is the entry type that you want to remove?')
+ index = find_index(header_line, header)
+ moon_array = filter_by_string(moon_array, index, invalid_string)
+ elif mode.lower() == 'under':
+ header = input('What is the header on the colunm you want to filter on?')
+ max_num_str = input('What is the maximum value you want to keep?')
+ max_num = float(max_num_str)
+ index = find_index(header_line, header)
+ moon_array = filter_by_nums(moon_array, index, max_num)
+ else:
+ print('To pick entries you want to keep regardless of future filters that may remove them use ADD. \nTo remove entries based on a word colunm use REMOVE, for instance remove all entries with \'No associated disorder\' under Disorder. \nTo remove entries based on a number type column use UNDER, for instance you want to keep all entries with a diploid frequency under .5')
+ another = input('Do you want to add another filter? (Yes/No)')
+ if another.lower() == 'no':
+ again = False
+ elif another.lower() == 'yes':
+ again = True
+ else:
+ again = False
+ confirm = input('That was not a Yes or No, so it will be taken as a No. \nIf you would like to add another filter please type Yes now.')
+ if confirm.lower() == 'yes':
+ again = True
+ #put the arrays back together
+ total_array = white_list_array + moon_array
+ #flag low read variants
+ reads_index = find_index(header_line, 'Depth')
+
+ for entry in total_array:
+ count = 1
+
+ for value in entry:
+ if count < len(entry):
+ filtered_moon_file.write(value + '\t')
+ count += 1
+ else:
+ try:
+ if float(entry[reads_index]) < 10:
+ value = value.strip('\n') + '\t LOW READ DEPTH \n'
+ except ValueError:
+ pass
+ filtered_moon_file.write(value)
+
+ filtered_moon_file.close()
+
+ return filtered_moon_file_name
+
+
+"""
+
def main():
    """Entry point: filter, gene-list subset, then populate per the CLI flags.

    The large block of commented-out interactive code that used to trail
    this function (carried as a bare string literal) has been removed.
    """
    args = supply_args()
    tsv_file = args.variants
    moon_filtered = tsv_file
    # Stage 1: standard Moon filtering, unless explicitly turned off.
    if not args.filterOff:
        moon_filtered = filter_moon_tsv(tsv_file)
    # Stage 2: optional restriction to a user-supplied gene list.
    if args.geneList:
        gene_list = build_list(args.geneList)
        moon_filtered = filter_tsv_by_list(moon_filtered, gene_list, args.header)
    # Stage 3: annotation, which needs all three reference files.
    if args.omim and args.orphConv and args.orphPrev:
        populate(moon_filtered, args.omim, args.orphConv, args.orphPrev, args.header)
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/moon_tools/de_dup.py b/moon_tools/de_dup.py
new file mode 100644
index 00000000..e100b3c9
--- /dev/null
+++ b/moon_tools/de_dup.py
@@ -0,0 +1,59 @@
+""" de_dup.py
+ Script that takes two text files with lists of genes and returns a text file with a list of the genes from the medical list with duplicates removed
+ Eleanor Campbell
+"""
+
+import sys
+
def build_list(file_name):
    """Read a gene list (one gene per line) into a sorted, upper-cased list.

    Entries are stripped of surrounding whitespace and upper-cased so
    comparisons are case-insensitive.  Blank lines are skipped so they
    cannot turn into empty entries in the de-duplicated output, matching
    the behavior of the gene-list reader in Moon_filter_galaxy.py.
    """
    # Context manager closes the file even if a read fails.
    with open(file_name, 'r') as gene_file:
        genes = [line.strip().upper() for line in gene_file]
    return sorted(gene for gene in genes if gene != '')
+
def de_dup(med_list, hpo_list):
    """Return the genes of ``med_list`` that do not appear in ``hpo_list``.

    Membership is tested against a set, making the pass
    O(len(med_list) + len(hpo_list)) instead of the previous quadratic
    list scan; input order of ``med_list`` is preserved.
    """
    hpo_set = set(hpo_list)
    return [gene for gene in med_list if gene not in hpo_set]
+
+
def write_list(my_list, new_name):
    """Write each entry of ``my_list`` to file ``new_name``, one per line."""
    # Context manager guarantees the file is flushed and closed.
    with open(new_name, 'w') as out_file:
        out_file.writelines(entry + '\n' for entry in my_list)
+
+
def main(med_genes, hpo_genes):
    """Remove the HPO genes from the medical gene list and write the result."""
    medical = build_list(med_genes)
    hpo = build_list(hpo_genes)
    remaining = de_dup(medical, hpo)
    # Output file name is fixed (historically derived from the HPO file name).
    write_list(remaining, 'output.txt')
+
+if __name__ == "__main__":
+ args = sys.argv
+ main(args[1], args[2])
+
\ No newline at end of file
diff --git a/moon_tools/de_dup.xml b/moon_tools/de_dup.xml
new file mode 100644
index 00000000..d7e35ffe
--- /dev/null
+++ b/moon_tools/de_dup.xml
@@ -0,0 +1,21 @@
+
+
+ Removes all the HPO genes from the medical exome list
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Takes the medical exome and removes all genes which have already been identified in the HPO gene list.
+
+
\ No newline at end of file
diff --git a/moon_tools/exomiser/application.properties b/moon_tools/exomiser/application.properties
new file mode 100644
index 00000000..5bfa9125
--- /dev/null
+++ b/moon_tools/exomiser/application.properties
@@ -0,0 +1,58 @@
+#
+# The Exomiser - A tool to annotate and prioritize genomic variants
+#
+# Copyright (c) 2016-2018 Queen Mary University of London.
+# Copyright (c) 2012-2016 Charité Universitätsmedizin Berlin and Genome Research Ltd.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see .
+#
+
+#root path where data is to be downloaded and worked on
+#it is assumed that all the files required by exomiser listed in this properties file
+#will be found in the data directory unless specifically overridden here.
+exomiser.data-directory=/place_holder_directory/data/
+### hg19 assembly ###
+exomiser.hg19.data-version=1805
+#transcript source will default to ensembl. Can define as ucsc/ensembl/refseq
+#exomiser.hg19.transcript-source=ensembl
+#exomiser.hg19.data-directory=${exomiser.data-directory}/${exomiser.hg19.data-version}_hg19
+#location of CADD/REMM Tabix files - you will need these for analysis of non-coding variants.
+#CADD can be downloaded from http://cadd.gs.washington.edu/download - v1.3 has been tested.
+#REMM can be downloaded from https://charite.github.io/software-remm-score.html
+#local frequencies are required to be normalised in the same manner as the input VCF and frequency values must be percentages.
+#
+#You will require the tsv.gz and tsv.gz.tbi (tabix) file pairs.
+#Un-comment and add the full path to the relevant tsv.gz files if you want to enable these.
+#exomiser.hg19.cadd-snv-path=${exomiser.hg19.data-directory}/whole_genome_SNVs.tsv.gz
+#exomiser.hg19.cadd-in-del-path=${exomiser.hg19.data-directory}/InDels.tsv.gz
+#exomiser.hg19.remm-path=${exomiser.hg19.data-directory}/remmData.tsv.gz
+#exomiser.hg19.local-frequency-path=${exomiser.hg19.data-directory}/local_frequency_test.tsv.gz
+### hg38 assembly ###
+# To enable analysis of samples called against the hg38 assembly copy the hg19 above and just replace the hg19 with hg38
+#exomiser.hg38.data-version=1805
+### phenotypes ###
+exomiser.phenotype.data-version=1807
+#exomiser.phenotype.data-directory=${exomiser.data-directory}/${exomiser.phenotype.data-version}_phenotype
+#String random walk data file
+#exomiser.phenotype.random-walk-file-name=/home/users/campbena/rw_string_9_05.mv
+#exomiser.phenotype.random-walk-index-file-name=rw_string_9_05_id2index.gz
+### caching ###
+#If you're running exomiser in batch mode there might be some performance benefit
+#if you enable caching. The 'simple' option will continue to store data in memory *without*
+#limit - this means for really long-running batch jobs and/or whole genomes you may run out of memory.
+#If this is likely choose the caffeine option and uncomment spring.cache.caffeine.spec and adjust the cache size
+#to your requirements
+#none/simple/caffeine
+#spring.cache.type=none
+#spring.cache.caffeine.spec=maximumSize=60000
\ No newline at end of file
diff --git a/moon_tools/exomiser/exomiser.xml b/moon_tools/exomiser/exomiser.xml
new file mode 100644
index 00000000..ba8a6cee
--- /dev/null
+++ b/moon_tools/exomiser/exomiser.xml
@@ -0,0 +1,153 @@
+
+ A Tool to Annotate and Prioritize Exome Variants
+
+ gatk4
+
+ > $log &&
+ export JAVA8_PATH="\$JAVA8_PATH" &&
+ java -version >> $log &&
+ ln -s $proband index_needed.vcf &&
+ gatk IndexFeatureFile -F index_needed.vcf 2>> $log &&
+ gatk RenameSampleInVcf -I index_needed.vcf -O proband.vcf --NEW_SAMPLE_NAME proband 2>> $log &&
+ #if $trio.trio_on:
+ gatk RenameSampleInVcf -I $trio.mother -O mother.vcf --NEW_SAMPLE_NAME mother 2>> $log &&
+ gatk RenameSampleInVcf -I $trio.father -O father.vcf --NEW_SAMPLE_NAME father 2>> $log &&
+ gatk CombineGVCFs -O trio.vcf -V proband.vcf -V mother.vcf -V father.vcf
+ #if $trio.reference_source.reference_source_selector != "no_ref"
+ #if $trio.reference_source.reference_source_selector != "history"
+ --reference ${trio.reference_source.reference_sequence.fields.path}
+ #else
+ --reference ${trio.reference_source.reference_sequence}
+ #end if
+ #end if
+ 2>> $log &&
+ ln -s $__tool_directory__/${trio.gender}.ped ${trio.gender}.ped &&
+ #end if
+
+ #if $hpo_selector.source == "moon":
+ ln -s $__tool_directory__/moon_api.py moon_api.py &&
+ $__tool_directory__/hpo_from_moon_python.sh $hpo_selector.moon_id 2>> $log &&
+ #else if $hpo_selector.source == "text_input":
+ echo '$hpo_selector.hpo_terms' > hpo_pre.txt &&
+ sed "s/__sq__/'/g" hpo_pre.txt > hpo.txt &&
+ #else:
+ ln -s $hpo_selector.hpo_terms hpo.txt &&
+ #end if
+
+
+ cat
+ #if $trio.trio_on:
+ $__tool_directory__/trio_pre_${trio.gender}.yml
+ #else:
+ $__tool_directory__/pre.yml
+ #end if
+
+ hpo.txt
+
+ $__tool_directory__/post.yml > analysis.yml &&
+
+ \$JAVA8_PATH -Xms2g -Xmx4g -jar $__tool_directory__/exomiser-cli-11.0.0/exomiser-cli-11.0.0.jar --analysis analysis.yml >> $log &&
+ Rscript $__tool_directory__/exomiser_merger.R output_AR.variants.tsv output_AD.variants.tsv output_XD.variants.tsv output_XR.variants.tsv output_MT.variants.tsv >> $log
+]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+The Exomiser is a Java program that finds potential disease-causing variants from whole-exome or whole-genome sequencing data.
+
+Starting from a VCF file and a set of phenotypes encoded using the Human Phenotype Ontology (HPO) it will annotate, filter and prioritise likely causative variants. The program does this based on user-defined criteria such as a variant's predicted pathogenicity, frequency of occurrence in a population and also how closely the given phenotype matches the known phenotype of diseased genes from human and model organism data.
+
+The functional annotation of variants is handled by Jannovar and uses UCSC KnownGene transcript definitions and hg19 genomic coordinates.
+
+Variants are prioritised according to user-defined criteria on variant frequency, pathogenicity, quality, inheritance pattern, and model organism phenotype data. Predicted pathogenicity data is extracted from the dbNSFP resource. Variant frequency data is taken from the 1000 Genomes, ESP and ExAC datasets. Subsets of these frequency and pathogenicity data can be defined to further tune the analysis. Cross-species phenotype comparisons come from our PhenoDigm tool powered by the OWLTools OWLSim algorithm.
+
+The Exomiser was developed by the Computational Biology and Bioinformatics group at the Institute for Medical Genetics and Human Genetics of the Charité - Universitätsmedizin Berlin, the Mouse Informatics Group at the Sanger Institute and other members of the Monarch initiative.
+
+
+
+ 10.1101/gr.160325.113
+
+
+
+
diff --git a/moon_tools/exomiser/exomiser_merger.R b/moon_tools/exomiser/exomiser_merger.R
new file mode 100644
index 00000000..d489a3cc
--- /dev/null
+++ b/moon_tools/exomiser/exomiser_merger.R
@@ -0,0 +1,35 @@
### An R script that takes the exomiser per-inheritance output files, merges
### them, sorts by combined score, and writes the merged table plus the top
### ten variants.

# Command-line arguments: one or more exomiser variants TSV files.
args <- commandArgs(TRUE)

# Read every input file and stack them into a single data frame.  Column
# names from the first file are reused for the others so that rbind()
# succeeds even if later headers differ slightly.
exomiser_data <- NULL
for (path in args) {
    current <- read.table(path, header = TRUE, sep = '\t', comment.char = "")
    if (is.null(exomiser_data)) {
        exomiser_data <- current
    } else {
        exomiser_data <- rbind(exomiser_data, setNames(current, names(exomiser_data)))
    }
}

# Sort descending by the exomiser combined gene score.
exomiser_data <- exomiser_data[order(-exomiser_data$EXOMISER_GENE_COMBINED_SCORE), ]

# head() (rather than indexing rows 1:10) avoids producing NA-filled rows
# when fewer than ten variants are present.
top_ten <- head(exomiser_data, 10)

# Write the merged/sorted table and the top-ten subset for downstream tools.
write.csv(exomiser_data, file = "merged_sorted.csv")

write.csv(top_ten, file = "top_ten.csv")
\ No newline at end of file
diff --git a/moon_tools/exomiser/female.ped b/moon_tools/exomiser/female.ped
new file mode 100644
index 00000000..b2745254
--- /dev/null
+++ b/moon_tools/exomiser/female.ped
@@ -0,0 +1,3 @@
+FAM1 father 0 0 1 1
+FAM1 mother 0 0 2 1
+FAM1 proband father mother 2 2
\ No newline at end of file
diff --git a/moon_tools/exomiser/hpo_from_moon.sh b/moon_tools/exomiser/hpo_from_moon.sh
new file mode 100644
index 00000000..e23c69db
--- /dev/null
+++ b/moon_tools/exomiser/hpo_from_moon.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Usage: hpo_from_moon.sh <moon_sample_id>
#
# Fetches the Moon patient-info page for the given sample and writes its
# HPO term IDs to hpo.txt as a single comma-separated, single-quoted list,
# e.g.  'HP:0001250','HP:0000252'
#
# NOTE(review): the Moon API token and user e-mail are hard-coded below --
# consider moving these credentials into environment variables before this
# script is shared or committed more widely.

# Keep only the HTML lines linking to HPO terms, then strip all tags,
# leaving the bare term text.
HPOS=$(curl -X GET -d "user_token=j4yKzsZivJuCXzoxdUM6" -d "user_email=potteram@ohsu.edu" https://oregon.moon.diploid.com/samples/$1/patient-info | grep 'showterm?id=HP:' | sed "s/[<][^>]*[>]//g")

# COUNT tracks whether the first term has been written: the first creates
# hpo.txt (no leading comma), later terms are appended with a comma.
COUNT=1

# Unquoted $HPOS is word-split, so each whitespace-separated token is
# examined and only tokens containing "HP:" survive the filter.
for LINE in $HPOS; do
    if [[ $LINE == *HP:* ]]; then
        if [[ $COUNT == 1 ]]; then
            COUNT=2
            echo -n "'$LINE'" > hpo.txt
        else
            echo -n ",'$LINE'" >> hpo.txt
        fi
    fi
done
\ No newline at end of file
diff --git a/moon_tools/exomiser/hpo_from_moon_python.sh b/moon_tools/exomiser/hpo_from_moon_python.sh
new file mode 100644
index 00000000..6c70282e
--- /dev/null
+++ b/moon_tools/exomiser/hpo_from_moon_python.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# Usage: hpo_from_moon_python.sh <moon_sample_id>
#
# Same job as hpo_from_moon.sh, but the patient-info page is downloaded by
# moon_api.py (which writes info.txt) instead of calling curl directly.
# The sample's HPO term IDs end up in hpo.txt as a comma-separated,
# single-quoted list, e.g.  'HP:0001250','HP:0000252'

# moon_api.py -mode info saves the patient-info HTML to info.txt.
python moon_api.py -mode info -id $1

# Keep only the lines linking to HPO terms, then strip all HTML tags.
HPOS=$(grep 'showterm?id=HP:' info.txt | sed "s/[<][^>]*[>]//g")

# COUNT tracks whether the first term has been written: the first creates
# hpo.txt (no leading comma), later terms are appended with a comma.
COUNT=1

# Unquoted $HPOS is word-split; only tokens containing "HP:" are kept.
for LINE in $HPOS; do
    if [[ $LINE == *HP:* ]]; then
        if [[ $COUNT == 1 ]]; then
            COUNT=2
            echo -n "'$LINE'" > hpo.txt
        else
            echo -n ",'$LINE'" >> hpo.txt
        fi
    fi
done
diff --git a/moon_tools/exomiser/male.ped b/moon_tools/exomiser/male.ped
new file mode 100644
index 00000000..20e64cdb
--- /dev/null
+++ b/moon_tools/exomiser/male.ped
@@ -0,0 +1,3 @@
+FAM1 father 0 0 1 1
+FAM1 mother 0 0 2 1
+FAM1 proband father mother 1 2
\ No newline at end of file
diff --git a/moon_tools/exomiser/moon_api.py b/moon_tools/exomiser/moon_api.py
new file mode 100644
index 00000000..aca74479
--- /dev/null
+++ b/moon_tools/exomiser/moon_api.py
@@ -0,0 +1,72 @@
+import argparse
+import requests
+
+
def supply_args():
    """Build and parse the command-line options for the Moon API helper."""
    parser = argparse.ArgumentParser(description='')
    # (flag, help text, default) -- registered in the original order so the
    # generated --help output is unchanged.
    option_specs = [
        ('-id', "Moon ID", None),
        ('-mode', 'info, post, or analyse', None),
        ('-snp', 'SNP VCF file', None),
        ('-cnv', 'CNV VCF file', None),
        ('-age', 'Patient Age', None),
        ('-gender', 'Patient Gender', None),
        ('-consang', 'Is the patient Consanguinious?', None),
        ('-hp', "HPO Terms File", None),
        ('-family', "Family Members File", "none"),
    ]
    for flag, help_text, default in option_specs:
        parser.add_argument(flag, help=help_text, default=default)
    return parser.parse_args()
+
+
+
def get_patient_info(moon_id):
    """Download the Moon patient-info page for *moon_id* and save it.

    The HTML is written to info.txt in the working directory so the shell
    wrappers can grep it for HPO terms, gender, age, etc.

    Args:
        moon_id: Moon sample identifier (string) used to build the URL.

    Returns:
        The name of the file the page was written to ("info.txt").
    """
    parameters = {"user_token": "iSUQvGmVNSjq834g9fP5", "user_email": "campbena@ohsu.edu"}

    patient_url = "https://oregon.moon.diploid.com/samples/" + moon_id + "/patient-info"
    patient_info = requests.get(patient_url, params=parameters)

    # Use .text (decoded str) rather than .content (bytes): the file is
    # opened in text mode, so writing bytes would raise TypeError on
    # Python 3.  "with" guarantees the handle is closed.
    with open("info.txt", 'w') as info:
        info.write(patient_info.text)
    return "info.txt"
+
def post_sample(snp, cnv, age, gender, consang, hp, family="none"):
    """Create a new Moon sample by POSTing the VCF paths and patient metadata.

    Args:
        snp: path to the SNP VCF file.
        cnv: path to the CNV (structural variant) VCF file.
        age: patient age (must be numeric).
        gender: patient gender.
        consang: whether the patient is consanguineous.
        hp: HPO terms string.
        family: family members string; the sentinel "none" omits the field.

    Returns:
        The raw response body from Moon.
    """
    # curl-style "@/path" references for the upload fields.
    # NOTE(review): requests does not interpret "@" paths the way curl does,
    # so these are sent as literal strings -- see moon_tools/moon_api.py,
    # which uploads the files via the files= multipart argument instead.
    at_snp = "@/Users/campbena/Documents/galaxy-dev/tools/my_tools/" + snp
    at_cnv = "@/Users/campbena/Documents/galaxy-dev/tools/my_tools/" + cnv
    int(age)  # fail fast when the supplied age is not numeric (value unused)
    parameters = {"user_token": "iSUQvGmVNSjq834g9fP5", "user_email": "campbena@ohsu.edu", "snp_vcf_file": at_snp, "sv_vcf_file": at_cnv, "age": age, "gender": gender, "is_consanguinous": consang, "hpo_terms": hp, "family_members": family}

    # "none" means no family data: drop the key entirely.  dict.pop, not
    # dict.remove -- dicts have no remove() method, so the original raised
    # AttributeError whenever family was left at its default.
    if family == "none":
        parameters.pop("family_members")

    patient_url = "https://oregon.moon.diploid.com/samples.json"

    post_to_moon = requests.post(patient_url, params=parameters)
    print(post_to_moon.content)
    print(parameters)

    return post_to_moon.content
+
def analyse_sample(moon_id):
    """Ask Moon to run the automated analysis for the given sample ID.

    Args:
        moon_id: Moon sample identifier used to build the analysis URL.

    Returns:
        The raw response body from Moon.
    """
    credentials = {"user_token": "iSUQvGmVNSjq834g9fP5", "user_email": "campbena@ohsu.edu"}
    analysis_url = "https://oregon.moon.diploid.com/samples/%s/analysis.json" % moon_id
    response = requests.post(analysis_url, params=credentials)
    return response.content
+
def main():
    """Dispatch to the requested Moon API action based on the -mode flag."""
    args = supply_args()
    # Map each mode name to a zero-argument callable; unknown modes fall
    # through and return None, exactly like the original if-chain.
    actions = {
        "info": lambda: get_patient_info(args.id),
        "post": lambda: post_sample(args.snp, args.cnv, args.age,
                                    args.gender, args.consang, args.hp,
                                    args.family),
        "analyse": lambda: analyse_sample(args.id),
    }
    action = actions.get(args.mode)
    if action is not None:
        return action()

if __name__ == "__main__":
    main()
diff --git a/moon_tools/exomiser/other.ped b/moon_tools/exomiser/other.ped
new file mode 100644
index 00000000..8c10bc1e
--- /dev/null
+++ b/moon_tools/exomiser/other.ped
@@ -0,0 +1,3 @@
+FAM1 father 0 0 1 1
+FAM1 mother 0 0 2 1
+FAM1 proband father mother 0 2
\ No newline at end of file
diff --git a/moon_tools/exomiser/post.yml b/moon_tools/exomiser/post.yml
new file mode 100644
index 00000000..c928c087
--- /dev/null
+++ b/moon_tools/exomiser/post.yml
@@ -0,0 +1,112 @@
+]
+ # These are the default settings, with values representing the maximum minor allele frequency in percent (%) permitted for an
+ # allele to be considered as a causative candidate under that mode of inheritance.
+ # If you just want to analyse a sample under a single inheritance mode, delete/comment-out the others. For AUTOSOMAL_RECESSIVE
+ # or X_RECESSIVE ensure *both* relevant HOM_ALT and COMP_HET modes are present.
+ # In cases where you do not want any cut-offs applied an empty map should be used e.g. inheritanceModes: {}
+ inheritanceModes: {
+ AUTOSOMAL_DOMINANT: 0.1,
+ AUTOSOMAL_RECESSIVE_HOM_ALT: 0.1,
+ AUTOSOMAL_RECESSIVE_COMP_HET: 2.0,
+ X_DOMINANT: 0.1,
+ X_RECESSIVE_HOM_ALT: 0.1,
+ X_RECESSIVE_COMP_HET: 2.0,
+ MITOCHONDRIAL: 0.2
+ }
+ #FULL, SPARSE or PASS_ONLY
+ analysisMode: PASS_ONLY
+ #Possible frequencySources:
+ #Thousand Genomes project - http://www.1000genomes.org/ (THOUSAND_GENOMES)
+ #TOPMed - https://www.nhlbi.nih.gov/science/precision-medicine-activities (TOPMED)
+ #UK10K - http://www.uk10k.org/ (UK10K)
+ #ESP project - http://evs.gs.washington.edu/EVS/ (ESP_)
+ # ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL,
+ #ExAC project http://exac.broadinstitute.org/about (EXAC_)
+ # EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN,
+ # EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN,
+ # EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN,
+ # EXAC_OTHER
+ #gnomAD - http://gnomad.broadinstitute.org/ (GNOMAD_E, GNOMAD_G)
+ frequencySources: [
+ THOUSAND_GENOMES,
+ TOPMED,
+ UK10K,
+
+ ESP_AFRICAN_AMERICAN, ESP_EUROPEAN_AMERICAN, ESP_ALL,
+
+ EXAC_AFRICAN_INC_AFRICAN_AMERICAN, EXAC_AMERICAN,
+ EXAC_SOUTH_ASIAN, EXAC_EAST_ASIAN,
+ EXAC_FINNISH, EXAC_NON_FINNISH_EUROPEAN,
+ EXAC_OTHER,
+
+ GNOMAD_E_AFR,
+ GNOMAD_E_AMR,
+# GNOMAD_E_ASJ,
+ GNOMAD_E_EAS,
+ GNOMAD_E_FIN,
+ GNOMAD_E_NFE,
+ GNOMAD_E_OTH,
+ GNOMAD_E_SAS,
+
+ GNOMAD_G_AFR,
+ GNOMAD_G_AMR,
+# GNOMAD_G_ASJ,
+ GNOMAD_G_EAS,
+ GNOMAD_G_FIN,
+ GNOMAD_G_NFE,
+ GNOMAD_G_OTH,
+ GNOMAD_G_SAS
+ ]
+ #Possible pathogenicitySources: POLYPHEN, MUTATION_TASTER, SIFT, CADD, REMM
+ #REMM is trained on non-coding regulatory regions
+ #*WARNING* if you enable CADD or REMM ensure that you have downloaded and installed the CADD/REMM tabix files
+ #and updated their location in the application.properties. Exomiser will not run without this.
+ pathogenicitySources: [POLYPHEN, MUTATION_TASTER, SIFT]
+ #this is the standard exomiser order.
+ #all steps are optional
+ steps: [
+ #intervalFilter: {interval: 'chr10:123256200-123256300'},
+ # or for multiple intervals:
+ #intervalFilter: {intervals: ['chr10:123256200-123256300', 'chr10:123256290-123256350']},
+ # or using a BED file - NOTE this should be 0-based, Exomiser otherwise uses 1-based coordinates in line with VCF
+ #intervalFilter: {bed: /full/path/to/bed_file.bed},
+ #genePanelFilter: {geneSymbols: ['FGFR1','FGFR2']},
+ #failedVariantFilter: {},
+ #qualityFilter: {minQuality: 50.0},
+ variantEffectFilter: {remove: [UPSTREAM_GENE_VARIANT,
+ INTERGENIC_VARIANT,
+ REGULATORY_REGION_VARIANT,
+ CODING_TRANSCRIPT_INTRON_VARIANT,
+ NON_CODING_TRANSCRIPT_INTRON_VARIANT,
+ SYNONYMOUS_VARIANT,
+ DOWNSTREAM_GENE_VARIANT,
+ SPLICE_REGION_VARIANT]},
+ #knownVariantFilter: {}, #removes variants represented in the database
+ frequencyFilter: {maxFrequency: 2.0},
+ pathogenicityFilter: {keepNonPathogenic: true},
+ #inheritanceFilter and omimPrioritiser should always run AFTER all other filters have completed
+ #they will analyse genes according to the specified modeOfInheritance above- UNDEFINED will not be analysed.
+ inheritanceFilter: {},
+ #omimPrioritiser isn't mandatory.
+ omimPrioritiser: {},
+ #priorityScoreFilter: {minPriorityScore: 0.4},
+ #Other prioritisers: Only combine omimPrioritiser with one of these.
+ #Don't include any if you only want to filter the variants.
+ hiPhivePrioritiser: {},
+ # or run hiPhive in benchmarking mode:
+ #hiPhivePrioritiser: {runParams: 'mouse'},
+ #phivePrioritiser: {}
+ #phenixPrioritiser: {}
+ #exomeWalkerPrioritiser: {seedGeneIds: [11111, 22222, 33333]}
+ ]
+outputOptions:
+ outputPassVariantsOnly: false
+ #numGenes options: 0 = all or specify a limit e.g. 500 for the first 500 results
+ numGenes: 50
+ #outputPrefix options: specify the path/filename without an extension and this will be added
+ # according to the outputFormats option. If unspecified this will default to the following:
+ # {exomiserDir}/results/input-vcf-name-exomiser-results.html
+ # alternatively, specify a fully qualified path only. e.g. /users/jules/exomes/analysis
+ outputPrefix: output
+ #out-format options: HTML, JSON, TSV_GENE, TSV_VARIANT, VCF (default: HTML)
+ outputFormats: [HTML, JSON, TSV_GENE, TSV_VARIANT, VCF]
\ No newline at end of file
diff --git a/moon_tools/exomiser/pre.yml b/moon_tools/exomiser/pre.yml
new file mode 100644
index 00000000..b3981331
--- /dev/null
+++ b/moon_tools/exomiser/pre.yml
@@ -0,0 +1,11 @@
+## Exomiser Analysis Template for multi-sample VCF files
+# These are all the possible options for running exomiser. Use this as a template for
+# your own set-up.
+# java -Xms2g -Xmx4g -jar exomiser-cli-11.0.0.jar --analysis n18KD_036M0097/analysis_proband.yml
+analysis:
+ # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt.
+ genomeAssembly: hg19
+ vcf: proband.vcf
+ ped:
+ proband:
+ hpoIds: [
\ No newline at end of file
diff --git a/moon_tools/exomiser/trio_pre_female.yml b/moon_tools/exomiser/trio_pre_female.yml
new file mode 100644
index 00000000..88ed0619
--- /dev/null
+++ b/moon_tools/exomiser/trio_pre_female.yml
@@ -0,0 +1,11 @@
+## Exomiser Analysis Template for multi-sample VCF files
+# These are all the possible options for running exomiser. Use this as a template for
+# your own set-up.
+# java -Xms2g -Xmx4g -jar exomiser-cli-11.0.0.jar --analysis n18KD_036M0097/analysis_proband.yml
+analysis:
+ # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt.
+ genomeAssembly: hg19
+ vcf: proband.vcf
+ ped: female.ped
+ proband: proband
+ hpoIds: [
\ No newline at end of file
diff --git a/moon_tools/exomiser/trio_pre_male.yml b/moon_tools/exomiser/trio_pre_male.yml
new file mode 100644
index 00000000..c2bc60c4
--- /dev/null
+++ b/moon_tools/exomiser/trio_pre_male.yml
@@ -0,0 +1,11 @@
+## Exomiser Analysis Template for multi-sample VCF files
+# These are all the possible options for running exomiser. Use this as a template for
+# your own set-up.
+# java -Xms2g -Xmx4g -jar exomiser-cli-11.0.0.jar --analysis n18KD_036M0097/analysis_proband.yml
+analysis:
+ # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt.
+ genomeAssembly: hg19
+ vcf: proband.vcf
+ ped: male.ped
+ proband: proband
+ hpoIds: [
\ No newline at end of file
diff --git a/moon_tools/exomiser/trio_pre_other.yml b/moon_tools/exomiser/trio_pre_other.yml
new file mode 100644
index 00000000..cdf4db5f
--- /dev/null
+++ b/moon_tools/exomiser/trio_pre_other.yml
@@ -0,0 +1,11 @@
+## Exomiser Analysis Template for multi-sample VCF files
+# These are all the possible options for running exomiser. Use this as a template for
+# your own set-up.
+# java -Xms2g -Xmx4g -jar exomiser-cli-11.0.0.jar --analysis n18KD_036M0097/analysis_proband.yml
+analysis:
+ # hg19 or hg38 - ensure that the application has been configured to run the specified assembly otherwise it will halt.
+ genomeAssembly: hg19
+ vcf: proband.vcf
+ ped: other.ped
+ proband: proband
+ hpoIds: [
\ No newline at end of file
diff --git a/moon_tools/gatk4_sortvcf.xml b/moon_tools/gatk4_sortvcf.xml
new file mode 100644
index 00000000..8818fdf3
--- /dev/null
+++ b/moon_tools/gatk4_sortvcf.xml
@@ -0,0 +1,37 @@
+
+ - Sorts one or more VCF files.
+
+ macros.xml
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/moon_tools/gene_filter.py b/moon_tools/gene_filter.py
new file mode 100644
index 00000000..20a7f434
--- /dev/null
+++ b/moon_tools/gene_filter.py
@@ -0,0 +1,193 @@
+""" gene_filter_v2.py
+ Script that takes a text file with a list of genes and a tsv file with CNVs and returns a tsv file with a list of CNVs that contain at least one of the genes.
+ Eleanor Campbell
+"""
+
+import sys
+
def build_list(file_name):
    """Read a gene-list text file (one gene per line) into a sorted list.

    Each line is stripped of surrounding whitespace and upper-cased so that
    later comparisons are case-insensitive; blank lines are skipped.

    Args:
        file_name: path of the text file to read.

    Returns:
        Sorted list of upper-cased gene symbols.
    """
    with open(file_name, 'r') as gene_file:
        raw_entries = [line.strip().upper() for line in gene_file]
    genes = [entry for entry in raw_entries if entry]
    genes.sort()
    return genes
+
def new_split(string_list, delimeter):
    """Further split the entries of *string_list* in place on *delimeter*.

    The gene column is first split on commas elsewhere, but genes can also
    be joined by semicolons: any entry containing *delimeter* is removed
    and its split parts are appended to the end of the list.

    Args:
        string_list: list of strings; mutated in place (returns None).
        delimeter: the secondary delimiter to split on (e.g. ';').
    """
    # Iterate over a snapshot of the list: the original looped over the
    # live list while calling remove(), which makes the iterator skip the
    # element after each removal, so consecutive delimited entries were
    # silently left unsplit.
    for entry in list(string_list):
        if delimeter in entry:
            string_list.remove(entry)
            string_list.extend(entry.split(delimeter))
+
def gene_in_line(gene_list, line, gene_index):
    """Return True if any gene named on *line* appears in *gene_list*.

    Args:
        gene_list: list of upper-cased gene symbols to search.
        line: one tab-separated row from the CNV file.
        gene_index: column index of the gene field within the row.
    """
    # Upper-case the row so the membership test is case-insensitive
    # (gene_list entries are already upper-cased by build_list).
    gene_column = line.upper().split('\t')[gene_index]
    # Genes are comma-separated, and some entries are further joined by
    # semicolons, so split on both.
    candidates = gene_column.split(',')
    new_split(candidates, ';')
    for candidate in candidates:
        cleaned = candidate.strip().strip('"')
        if cleaned in gene_list:
            return True
    return False
+
def filter_tsv_by_list(tsv_file_name, gene_list):
    """Write filtered.tsv with only the rows whose GENES column hits *gene_list*.

    The header row is always copied through.  The GENES column is located by
    name; if no column is called GENES the first column (index 0) is used,
    matching the original behaviour.

    Args:
        tsv_file_name: path of the CNV TSV file to filter.
        gene_list: list of upper-cased gene symbols to keep.

    Returns:
        The name of the filtered file ("filtered.tsv").
    """
    filtered_tsv_name = 'filtered.tsv'
    # "with" closes both files even if a row raises; the original leaked
    # the handles on error.
    with open(tsv_file_name, 'r') as tsv, open(filtered_tsv_name, 'w') as filtered_tsv:
        first_line = tsv.readline()
        # Locate the GENES column by header name; enumerate replaces the
        # original manual counter that ignored its own loop variable.
        # If GENES appears more than once the last occurrence wins, as before.
        gene_index = 0
        for position, header in enumerate(first_line.split('\t')):
            if header.strip() == 'GENES':
                gene_index = position
        # Header row is copied through unconditionally.
        filtered_tsv.write(first_line)
        for line in tsv:
            if gene_in_line(gene_list, line, gene_index):
                filtered_tsv.write(line)
    return filtered_tsv_name
+
def find_frequency(line, type_index, del_freq_index, dup_freq_index):
    """Look up a CNV row's population frequency based on its DEL/DUP type.

    Args:
        line: one tab-separated row from the CNV file.
        type_index: column index of the DEL/DUP type field.
        del_freq_index: column index of the deletion frequency.
        dup_freq_index: column index of the duplication frequency.

    Returns:
        The frequency value from the matching column, or -1 when the type
        is neither 'DUP' nor 'DEL' so the row is kept for manual review.
    """
    fields = line.split('\t')
    cnv_type = fields[type_index]
    if cnv_type == 'DUP':
        return fields[dup_freq_index]
    if cnv_type == 'DEL':
        return fields[del_freq_index]
    # Unknown type: -1 passes every frequency cut-off downstream, so the
    # row surfaces in both filtered outputs for closer inspection.
    return -1
+
def get_index(header_line, header_name):
    """Return the column position of *header_name* in a tab-separated header.

    Each header is stripped before comparison so a trailing newline does
    not break matching.  Returns None when the header is absent.

    Args:
        header_line: the tab-separated header row.
        header_name: the exact column name to locate.
    """
    for position, header in enumerate(header_line.split('\t')):
        if header.strip() == header_name:
            return position
    return None
+
def filter_by_freq(tsv_name):
    """Split a CNV TSV into u5.tsv (freq <= 5%) and u1.tsv (freq <= 1%).

    Both output files receive the header row.  Rows whose type is unknown
    come back from find_frequency as -1 and therefore land in both outputs
    so they can be reviewed manually.

    Args:
        tsv_name: path of the (already gene-filtered) CNV TSV file.
    """
    with open(tsv_name, 'r') as source, \
            open('u5.tsv', 'w') as under_five, \
            open('u1.tsv', 'w') as under_one:
        # Copy the header into both outputs.
        header = source.readline()
        under_one.write(header)
        under_five.write(header)
        # Column positions of the type and frequency fields, by header name.
        type_index = get_index(header, 'DEL/DUP')
        del_freq_index = get_index(header, 'POP DEL AF')
        dup_freq_index = get_index(header, 'POP DUP AF')
        for row in source:
            frequency = float(find_frequency(row, type_index, del_freq_index, dup_freq_index))
            if frequency <= .01:
                # Anything under 1% also belongs in the under-5% file.
                under_one.write(row)
                under_five.write(row)
            elif frequency <= .05:
                under_five.write(row)
+
+
def main(args):
    """Entry point: filter a CNV file by gene list(s), then by frequency.

    Args:
        args: sys.argv-style list -- args[1] is the CNV TSV, args[2] and
            optionally args[3] are gene-list text files.
    """
    if len(args) < 2:
        # Without a CNV file there is nothing to filter; bail out instead
        # of crashing with an IndexError on args[1] below.
        print("Please provide an appropriate number of arguments: \npython gene_filter.py cnv_file.tsv gene_list.txt \nor \npython gene_filter.py cnv_file.tsv gene_list_1.txt gene_list_2.txt")
        return
    if len(args) == 3:
        gene_list = build_list(args[2])
    elif len(args) == 4:
        # Two gene lists: concatenate them into one search list.
        gene_list = build_list(args[2]) + build_list(args[3])
    else:
        # Wrong argument count but a CNV file was given: warn, then fall
        # through with an empty gene list (matching original behaviour,
        # which still produced header-only output files).
        gene_list = []
        print("Please provide an appropriate number of arguments: \npython gene_filter.py cnv_file.tsv gene_list.txt \nor \npython gene_filter.py cnv_file.tsv gene_list_1.txt gene_list_2.txt")
    new_tsv_name = filter_tsv_by_list(args[1], gene_list)
    filter_by_freq(new_tsv_name)

if __name__ == "__main__":
    args = sys.argv
    main(args)
+
+
\ No newline at end of file
diff --git a/moon_tools/gene_filter.xml b/moon_tools/gene_filter.xml
new file mode 100644
index 00000000..51d1775d
--- /dev/null
+++ b/moon_tools/gene_filter.xml
@@ -0,0 +1,19 @@
+
+ Filter a CNV list by a gene list and by frequency.
+
+
+
+
+
+
+
+
+
+
+
+
+ Takes a list of CNVs and filters by a list of genes to return the list of CNVs in relevant genes. Then filters that list by frequency, returning a list of those CNVs under 5% and a separate list of those under 1%.
+
+
\ No newline at end of file
diff --git a/moon_tools/macros.xml b/moon_tools/macros.xml
new file mode 100644
index 00000000..32a4c78e
--- /dev/null
+++ b/moon_tools/macros.xml
@@ -0,0 +1,751 @@
+
+
+ 4.0.5.1
+ @VERSION@+galaxy
+
+
+
+ gatk4
+ tabix
+
+
+
+
+
+
+ gatk SortSam --version 2>&1 | grep Version | cut -d ':' -f 2
+
+
+
+ gatk --java-options "-Xmx24g -Xms2g"
+
+
+
+
+
+
+
+
+ --input input.bam
+
+
+
+
+
+
+ --INPUT input.bam
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if str($ival_type.ival_type_sel) == "ival_file"
+ #if $ival_type.intervals
+ #if $ival_type.intervals.is_of_type("gatk_interval")
+ --intervals intervals.interval_list
+ #end if
+ #if $ival_type.intervals.is_of_type("bed")
+ --intervals intervals.bed
+ #end if
+ #if $ival_type.intervals.is_of_type("vcf")
+ --intervals intervals.vcf
+ #end if
+ #end if
+ #else
+ #if $ival_type.intervals
+ --intervals "${ival_type.intervals}"
+ #end if
+ #end if
+ #if $ival_type.interval_padding
+ --interval-padding "${ival_type.interval_padding}"
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if str($excl_ival_type.excl_ival_type_sel) == "excl_ival_file"
+ #if $excl_ival_type.exclude_intervals
+ #if $excl_ival_type.exclude_intervals.is_of_type("gatk_interval")
+ --exclude-intervals excl_intervals.interval_list
+ #end if
+ #if $excl_ival_type.exclude_intervals.is_of_type("bed")
+ --exclude-intervals excl_intervals.bed
+ #end if
+ #if $excl_ival_type.exclude_intervals.is_of_type("vcf")
+ --exclude-intervals excl_intervals.vcf
+ #end if
+ #end if
+ #else
+ #if $excl_ival_type.exclude_intervals
+ --exclude-intervals "${excl_ival_type.exclude_intervals}"
+ #end if
+ #end if
+ #if $excl_ival_type.interval_exclusion_padding
+ --interval-exclusion-padding "${excl_ival_type.interval_exclusion_padding}"
+ #end if
+
+
+
+
+
+
+ > "${log}" &&
+ #else
+ ln -s $file input${num}.vcf &&
+ #end if
+ #end for
+ ]]>
+
+
+
+
+
+
+
+ > "${log}" &&
+ ###elif $file.is_of_type("txt")
+ ###ln -s $file input${num}.list &&
+ #else
+ ln -s $file input${num}.vcf &&
+ #end if
+ #end for
+ ]]>
+
+
+
+
+
+
+ > "${log}" &&
+ #else
+ ln -s $input input.vcf &&
+ #end if
+ ]]>
+
+ > "${log}" &&
+ #else
+ ln -s $variant input.vcf &&
+ gatk IndexFeatureFile -F input.vcf &&
+ #end if
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ --output "${output}"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if $picard_adv.arguments_file
+ --arguments_file ${picard_adv.arguments_file}
+ #end if
+ #if $picard_adv.COMPRESSION_LEVEL
+ --COMPRESSION_LEVEL ${picard_adv.COMPRESSION_LEVEL}
+ #end if
+ #if $picard_adv.GA4GH_CLIENT_SECRETS
+ --GA4GH_CLIENT_SECRETS ${picard_adv.GA4GH_CLIENT_SECRETS}
+ #end if
+ #if $picard_adv.MAX_RECORDS_IN_RAM
+ --MAX_RECORDS_IN_RAM ${picard_adv.MAX_RECORDS_IN_RAM}
+ #end if
+ #if $picard_adv.VALIDATION_STRINGENCY
+ --VALIDATION_STRINGENCY ${picard_adv.VALIDATION_STRINGENCY}
+ #end if
+ #if $picard_adv.VERBOSITY
+ --VERBOSITY ${picard_adv.VERBOSITY}
+ #end if
+ #if $picard_adv.TMP_DIR
+ --TMP_DIR ${picard_adv.TMP_DIR}
+ #end if
+ ${picard_adv.CREATE_MD5_FILE}
+ ${picard_adv.USE_JDK_DEFLATER}
+ ${picard_adv.USE_JDK_INFLATER}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ not gzipped_output
+
+
+ gzipped_output
+
+
+
+
+
+ not gzipped_output
+
+
+ gzipped_output
+
+
+
+
+
+
+ #if $gzipped_output
+ --OUTPUT output.vcf.gz
+ #else
+ --OUTPUT output.vcf
+ #end if
+
+
+
+ #if $gzipped_output
+ --output output.vcf.gz
+ #else
+ --output output.vcf
+ #end if
+
+
+
+
+ picard_adv['CREATE_MD5_FILE']
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if $optional.reference_source.reference_source_selector != "no_ref"
+ #if $optional.reference_source.reference_source_selector != "history"
+ --REFERENCE_SEQUENCE ${optional.reference_source.reference_sequence.fields.path}
+ #else
+ --REFERENCE_SEQUENCE ${optional.reference_source.reference_sequence}
+ #end if
+ #end if
+
+
+
+ #if $reference_source.reference_source_selector != "no_ref"
+ #if $reference_source.reference_source_selector != "history"
+ --REFERENCE_SEQUENCE ${reference_source.reference_sequence.fields.path}
+ #else
+ --REFERENCE_SEQUENCE ${reference_source.reference_sequence}
+ #end if
+ #end if
+
+
+
+ #if $reference_source.reference_source_selector != "no_ref"
+ #if $reference_source.reference_source_selector != "history"
+ --reference ${reference_source.reference_sequence.fields.path}
+ #else
+ --reference ${reference_source.reference_sequence}
+ #end if
+ #end if
+
+
+
+ #if $optional.reference_source.reference_source_selector != "no_ref"
+ #if $optional.reference_source.reference_source_selector != "history"
+ --reference ${optional.reference_source.reference_sequence.fields.path}
+ #else
+ --reference ${optional.reference_source.reference_sequence}
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ #if $optional.seqdict_source.seqdict_source_selector != "no_seq_dict"
+ #if $optional.seqdict_source.seqdict_source_selector != "history"
+ #set seq_dict_loc = ''.join($optional.seqdict_source.seqdict_sequence.fields.path.split('.')[:-1]) + '.dict'
+ --SEQUENCE_DICTIONARY $seq_dict_loc
+ #else
+ --SEQUENCE_DICTIONARY ${optional.seqdict_source.seqdict_sequence}
+ #end if
+ #end if
+
+
+
+ #if $seqdict_source.seqdict_source_selector != "no_seq_dict"
+ #if $seqdict_source.seqdict_source_selector != "history"
+ #set seq_dict_loc = ''.join($seqdict_source.seqdict_sequence.fields.path.split('.')[:-1]) + '.dict'
+ --sequence-dictionary $seq_dict_loc
+ #else
+ --sequence-dictionary ${seqdict_source.seqdict_sequence}
+ #end if
+ #end if
+
+
+
+
+ 10.1101/gr.107524.110
+ 10.1038/ng.806
+ 10.1002/0471250953.bi1110s43
+ 10.1101/201178
+
+
+
+
+
+
+ #for $num, $i in enumerate($known_sites)
+ --known-sites ${i.site}
+ #end for
+
+
+
+
+
+
+
+
+
+
+
+
+
+ --output "${output}"
+
+
+
+
+
+
+
+ 2>> "${log}"
+
+
+
+
+
+
+
+
+
+
+
+
+ > "${log}" &&
+ #else
+ ln -s $file input${num}.vcf &&
+ #end if
+ #end for
+ @CMD_BEGIN@ GenomicsDBImport
+ #for $num, $file in enumerate($db_variants)
+ #if $file.is_of_type("vcf_bgzip")
+ -V input${num}.vcf.gz
+ #else
+ -V input${num}.vcf
+ #end if
+ #end for
+ #include source=$gatk_ints_chth#
+ --genomicsdb-workspace-path db_directory
+ $vcf_or_db.extra_params
+ #include source=$log_opts#
+ &&
+ #elif $vcf_or_db.input_type == "prev_DB":
+ cp $vcf_or_db.genomics_db genomics_db.tar &&
+ tar -xopf genomics_db.tar &&
+ #end if
+ #if $vcf_or_db.input_type == "gvcf":
+ echo $vcf_or_db.variant &&
+ #set datatype = $vcf_or_db.variant.datatype
+ #if $vcf_or_db.variant.is_of_type("vcf_bgzip")
+ ln -s $vcf_or_db.variant input.vcf.gz &&
+ tabix input.vcf.gz &&
+ @CMD_BEGIN@ IndexFeatureFile -F input.vcf.gz 2>> "${log}" &&
+ #else
+ ln -s $vcf_or_db.variant input.vcf &&
+ gatk IndexFeatureFile -F input.vcf &&
+ #end if
+ #end if
+ ]]>
+
+
+
+ #if $vcf_or_db.input_type != "gvcf":
+ -V gendb://db_directory
+ #end if
+ #if $vcf_or_db.input_type == "gvcf":
+ #if $vcf_or_db.variant.is_of_type("vcf_bgzip")
+ --variant input.vcf.gz
+ #else
+ --variant input.vcf
+ #end if
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/moon_tools/mito_merge.sh b/moon_tools/mito_merge.sh
new file mode 100644
index 00000000..71642926
--- /dev/null
+++ b/moon_tools/mito_merge.sh
@@ -0,0 +1,155 @@
#!/bin/bash

# Usage: mito_merge.sh <moon_id> <snp_vcf> <cnv_vcf>
#
# Re-creates a Moon sample: scrapes the patient-info page of sample $1 for
# HPO terms, gender, age, consanguinity and trio members, then posts a new
# sample with the supplied SNP ($2) and CNV ($3) VCFs and starts analysis.

# moon_api.py -mode info saves the patient-info HTML to info.txt.
python moon_api.py -id $1 -mode info

# Pull the HPO term lines out of the patient-info HTML and strip the tags.
HPOS=$(grep 'showterm?id=HP:' info.txt | sed "s/[<][^>]*[>]//g")

# Join the HP: tokens into one semicolon-separated string in hpo.txt.
COUNT=1
for LINE in $HPOS; do
    if [[ $LINE == *HP:* ]]; then
        if [[ $COUNT == 1 ]]; then
            COUNT=2
            echo -n "$LINE" > hpo.txt
        else
            echo -n ";$LINE" >> hpo.txt
        fi
    fi
done

# The currently selected <option> entries carry gender/health/relation info.
SELECTEDS=$(grep 'selected="selected"' info.txt)

GENDER="unknown"
for LINE in $SELECTEDS; do
    if [[ $LINE == *male* ]]; then
        GENDER="male"
    fi
    # Checked second: "female" also matches *male*, so this corrects it.
    if [[ $LINE == *female* ]]; then
        GENDER="female"
    fi
done

echo $GENDER

AGE=$(grep 'id="sample_age"' info.txt | cut -d '"' -f 8)

echo $AGE

CONSANG=$(grep 'id="sample_is_consanguinous"' info.txt | cut -d '"' -f 6)

IS_CONSANG="false"

if [[ $CONSANG == 1 ]]; then
    # Plain string assignment: the original 'let IS_CONSANG="true"' did
    # arithmetic evaluation of the (unset) variable "true" and stored 0.
    IS_CONSANG="true"
fi

echo $IS_CONSANG

# A "mother" option in the HTML means this sample has trio (family) data.
TRIO_CHECK=$(grep 'value="mother"' info.txt)

if [[ ! -z $TRIO_CHECK ]]; then
    TRIO=$(grep 'selected="selected"' info.txt)
    ID="nothing"
    REL="nothing"
    HEALTH="nothing"
    NUM=1
    LAST_LINE="nothing"
    TWO_LAST="nothing"
    # Split on newlines (not words) so each HTML line is handled whole.
    IFS=$'\n'
    for LINE in $TRIO; do
        echo "The line is:" $LINE
        # Keep a three-line history: a family member's ID appears three
        # lines before its mother/father relation line.
        THREE_LAST=$TWO_LAST
        echo $THREE_LAST
        TWO_LAST=$LAST_LINE
        echo $TWO_LAST
        LAST_LINE=$LINE
        # Debug trace of the current line (the original echoed the
        # undefined variable $LAST here).
        echo $LAST_LINE
        if [[ $LINE == *KD* ]]; then
            ID=$(echo $LINE | cut -d '"' -f 4 )
            echo $ID
        elif [[ $LINE == *healthy* ]]; then
            HEALTH="healthy"
            echo $HEALTH
        elif [[ $LINE == *affected* ]]; then
            HEALTH="affected"
            echo $HEALTH
        elif [[ $LINE == *mother* ]]; then
            REL="mother"
            echo $REL
            ID=$(echo $THREE_LAST | cut -d '"' -f 4 )
            # NUM distinguishes the first family member written (creates
            # parents.txt) from later ones (appended with ';').
            if [[ $NUM == 2 ]]; then
                echo -n ";$ID:$HEALTH:$REL" >> parents.txt
            else
                echo -n "$ID:$HEALTH:$REL" > parents.txt
            fi
            let NUM=2
        elif [[ $LINE == *father* ]]; then
            REL="father"
            echo $REL
            ID=$(echo $THREE_LAST | cut -d '"' -f 4 )
            if [[ $NUM == 2 ]]; then
                echo -n ";$ID:$HEALTH:$REL" >> parents.txt
            else
                echo -n "$ID:$HEALTH:$REL" > parents.txt
            fi
            let NUM=2
        fi
    done
fi

# cat (not the pager less) is the right tool for reading a file into a var.
HPO_TERMS=$(cat hpo.txt)

# parents.txt only exists for trio samples; guard to avoid a read error.
PARENTS=""
if [[ -f parents.txt ]]; then
    PARENTS=$(cat parents.txt)
fi

NEW_ID=$1

if [[ ! -z $TRIO_CHECK ]]; then
    # Debug preview of the equivalent curl call that moon_api.py performs.
    echo " curl -F ?user_token=iSUQvGmVNSjq834g9fP5?\ "
    echo " -F ?user_email=campbena@ohsu.edu?\ "
    echo " -F ?snp_vcf_file=@$2?\ "
    echo " -F ?sv_vcf_file=@$3?\ "
    echo " -F ?age=$AGE?\ "
    echo " -F ?gender=$GENDER?\ "
    echo " -F ?is_consanguinous=$IS_CONSANG?\ "
    echo " -F ?hpo_terms=$HPO_TERMS?\ "
    echo " -F ?family_members=$PARENTS?\ "
    echo " https://oregon.moon.diploid.com/samples.json"

    python moon_api.py -snp $2 -cnv $3 -mode post -age $AGE -gender $GENDER -consang $IS_CONSANG -hp "$HPO_TERMS" -family "$PARENTS"
else
    # No trio data: post the sample without the family_members field.
    python moon_api.py -snp $2 -cnv $3 -mode post -age $AGE -gender $GENDER -consang $IS_CONSANG -hp "$HPO_TERMS"
fi

# moon_api.py writes the POST response to new_id.txt; keep only the digits
# (the new Moon sample ID).
NEW_ID=$(sed 's/[^0-9]*//g' new_id.txt)
echo $NEW_ID

python moon_api.py -id $NEW_ID -mode analyse
\ No newline at end of file
diff --git a/moon_tools/moon_api.py b/moon_tools/moon_api.py
new file mode 100644
index 00000000..2b940847
--- /dev/null
+++ b/moon_tools/moon_api.py
@@ -0,0 +1,79 @@
+import argparse
+import requests
+
+
def supply_args():
    """Parse and return the command-line arguments for the Moon API client.

    Returns:
        argparse.Namespace with the attributes id, mode, snp, cnv, age,
        gender, consang, hp and family (family defaults to "none", which
        post_sample() treats as "no family members supplied").
    """
    parser = argparse.ArgumentParser(description='Command-line client for the Moon diagnostic API.')
    parser.add_argument('-id', help="Moon ID")
    parser.add_argument('-mode', help='info, post, or analyse')
    parser.add_argument('-snp', help='SNP VCF file')
    parser.add_argument('-cnv', help='CNV VCF file')
    parser.add_argument('-age', help='Patient Age')
    parser.add_argument('-gender', help='Patient Gender')
    parser.add_argument('-consang', help='Is the patient consanguineous?')
    parser.add_argument('-hp', help="HPO Terms")
    parser.add_argument('-family', help="Family Members", default="none")
    args = parser.parse_args()
    return args
+
+
+
def get_patient_info(moon_id):
    """Fetch the patient-info page for a Moon sample and save it to info.txt.

    :param moon_id: Moon sample id (string) to look up.
    :return: name of the file the response was written to ("info.txt").
    """
    # Moon authenticates every request with this account token/email pair.
    parameters = {"user_token": "iSUQvGmVNSjq834g9fP5", "user_email": "campbena@ohsu.edu"}

    patient_url = "https://oregon.moon.diploid.com/samples/" + moon_id + "/patient-info"
    patient_info = requests.get(patient_url, params=parameters)

    # response.content is bytes, so the file must be opened in binary mode
    # (text mode raises TypeError on Python 3); the context manager
    # guarantees the handle is closed even if the write fails.
    with open("info.txt", 'wb') as info:
        info.write(patient_info.content)
    return "info.txt"
+
def post_sample(snp, cnv, age, gender, consang, hp, family="none"):
    """Create a new Moon sample by POSTing the VCFs and patient metadata.

    Writes the raw JSON response to new_id.txt (the wrapper shell script
    extracts the numeric sample id from that file) and echoes the response
    and the submitted form fields to stdout.

    :param snp: path to the SNP VCF file.
    :param cnv: path to the CNV/SV VCF file.
    :param age: patient age.
    :param gender: patient gender.
    :param consang: consanguinity flag.
    :param hp: HPO terms string.
    :param family: family members string; "none" omits the field entirely.
    :return: the Moon server's response body as text.
    """
    patient_url = "https://oregon.moon.diploid.com/samples.json"

    # Open both VCFs with context managers so the handles are closed even if
    # the request raises (the original leaked both file objects).
    with open(snp, 'rb') as snp_fh, open(cnv, 'rb') as cnv_fh:
        # requests multipart convention: (None, value) sends a plain form
        # field, (filename, fileobj) sends a file part.
        parameters = {
            "user_token": (None, "iSUQvGmVNSjq834g9fP5"),
            "user_email": (None, "campbena@ohsu.edu"),
            "snp_vcf_file": (snp, snp_fh),
            "sv_vcf_file": (cnv, cnv_fh),
            "age": (None, age),
            "gender": (None, gender),
            "is_consanguinous": (None, consang),
            "hpo_terms": (None, hp),
        }
        # Only include family members when the caller actually supplied any.
        if family != "none":
            parameters["family_members"] = (None, family)

        post_to_moon = requests.post(patient_url, files=parameters)

    print(post_to_moon.text)
    print(parameters)

    # Persist the response so the calling shell script can read the new id.
    with open("new_id.txt", 'w') as id_file:
        id_file.write(post_to_moon.text)

    return post_to_moon.text
+
def analyse_sample(moon_id):
    """Start Moon's automated analysis for an existing sample.

    :param moon_id: Moon sample id to analyse.
    :return: raw response body (bytes) from the analysis endpoint.
    """
    # Account credentials sent with every Moon request.
    credentials = {"user_token": "iSUQvGmVNSjq834g9fP5", "user_email": "campbena@ohsu.edu"}
    endpoint = "https://oregon.moon.diploid.com/samples/" + moon_id + "/analysis.json"
    response = requests.post(endpoint, params=credentials)
    return response.content
+
def main():
    """Dispatch to the Moon API operation selected by the -mode argument."""
    args = supply_args()
    # Map each mode onto a zero-argument callable; unknown modes fall
    # through and return None, just like the original if-chain.
    actions = {
        "info": lambda: get_patient_info(args.id),
        "post": lambda: post_sample(args.snp, args.cnv, args.age,
                                    args.gender, args.consang, args.hp,
                                    args.family),
        "analyse": lambda: analyse_sample(args.id),
    }
    action = actions.get(args.mode)
    if action is not None:
        return action()

if __name__ == "__main__":
    main()
diff --git a/moon_tools/moon_cnv_formater.py b/moon_tools/moon_cnv_formater.py
new file mode 100644
index 00000000..97065a71
--- /dev/null
+++ b/moon_tools/moon_cnv_formater.py
@@ -0,0 +1,66 @@
+"""
 Takes a CNV VCF that is formatted with an SVTYPE=CNV INFO field and turns it into a VCF that Moon can read.
+"""
+
+
+import sys
+
+
def main(vcf_file, new_vcf_file):
    """Rewrite an XHMM CNV VCF so that Moon can interpret the CNV records.

    For every non-header record, the first sample's genotype column is used
    to classify the CNV (GT starting with '1' -> DEL, '2' -> DUP, anything
    else -> NONE), the ALT allele is replaced with the symbolic
    <DEL>/<DUP>/<NONE> allele, and the SVTYPE INFO entry is rewritten to
    match.

    :param vcf_file: path of the input VCF.
    :param new_vcf_file: path the reformatted VCF is written to.
    """
    with open(vcf_file, 'r') as vcf, open(new_vcf_file, 'w') as new_vcf:
        for line in vcf:
            # Header / meta lines pass through untouched.
            if line.startswith('#'):
                new_vcf.write(line)
                continue

            fields = line.split('\t')

            # Column 9 is the first sample; its leading genotype character
            # encodes the copy-number state in XHMM output.
            genotype = fields[9]
            if genotype.startswith('1'):
                cnv_type = 'DEL'
            elif genotype.startswith('2'):
                cnv_type = 'DUP'
            else:
                cnv_type = 'NONE'

            # Replace ALT with the symbolic allele Moon expects.
            fields[4] = '<' + cnv_type + '>'

            # Rewrite the SVTYPE INFO entry. The original code blindly
            # overwrote info_array[6], which corrupts the record (or raises
            # IndexError) whenever SVTYPE is not the 7th entry; search for
            # the entry instead.
            info_entries = fields[7].split(';')
            for i, entry in enumerate(info_entries):
                if entry.startswith('SVTYPE='):
                    info_entries[i] = 'SVTYPE=' + cnv_type
            fields[7] = ';'.join(info_entries)

            new_vcf.write('\t'.join(fields))
+
+
if __name__ == "__main__":
    # Usage: moon_cnv_formater.py <input_vcf> <output_vcf>
    main(sys.argv[1], sys.argv[2])
\ No newline at end of file
diff --git a/moon_tools/moon_cnv_formater.xml b/moon_tools/moon_cnv_formater.xml
new file mode 100644
index 00000000..7f0ac2b0
--- /dev/null
+++ b/moon_tools/moon_cnv_formater.xml
@@ -0,0 +1,14 @@
+
+ reformats a CNV vcf generated by XHMM so it can be read by Moon.
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/moon_tools/moon_filter.xml b/moon_tools/moon_filter.xml
new file mode 100644
index 00000000..0f623d0b
--- /dev/null
+++ b/moon_tools/moon_filter.xml
@@ -0,0 +1,57 @@
+
+
+ filters and populates the All Rare Variants file from Moon.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ filter_on
+
+
+ gene_list
+
+
+ data_files['omim_data'] and data_files['orph_conv'] and data_files['orph_prev']
+
+
+ Used to filter and populate an All Rare Variants file from Moon.
 Can also be used to perform any of its component parts alone or in pairs; simply leave off any unnecessary files.
+
+ POPULATER: Leave off the gene list file and turn OFF the filter. Can be used on non-Moon files. Specify the header of the gene column.
+
+ GENE LIST FILTER: Do not include the data files and turn OFF the filter. Can be used on non-Moon files. Specify the header on the gene column.
+
+ FILTER ONLY: Add only the file to be filtered, do not include the gene list file or the data files. Formatted for Moon-generated files only.
+
+
+
+
\ No newline at end of file
diff --git a/moon_tools/moon_reposter.xml b/moon_tools/moon_reposter.xml
new file mode 100644
index 00000000..884bd853
--- /dev/null
+++ b/moon_tools/moon_reposter.xml
@@ -0,0 +1,19 @@
+
+ Creates a new Moon sample based on an old Moon sample with a new VCF.
+ $log
+ ]]>
+
+
+
+
+
+
+
+
+
+
+
diff --git a/moon_tools/populator.xml b/moon_tools/populator.xml
new file mode 100644
index 00000000..4e1b7a0f
--- /dev/null
+++ b/moon_tools/populator.xml
@@ -0,0 +1,30 @@
+
+
 populates a variant file with inheritance and prevalence.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 Populates a file with OMIM inheritance, OMIM disorder, Orphanet disorder, and Orphanet prevalence.
+
+
+
\ No newline at end of file