diff --git a/server/conversion/convert_to_gff.py b/server/conversion/convert_to_gff.py index 634b6be..e852635 100644 --- a/server/conversion/convert_to_gff.py +++ b/server/conversion/convert_to_gff.py @@ -4,7 +4,7 @@ import os import sys from optparse import OptionParser -import detect_format, cgivar_to_gff, gff_from_23andme, vcf_to_gff +import detect_format, cgivar_to_gff, gff_from_23andme, gff_from_decodeme, vcf_to_gff # A bit of path manipulation to import autozip.py from ../utils/ GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) @@ -24,6 +24,8 @@ def convert(input_file, options=None): input_data = gff_from_23andme.convert(input_file) elif input_type == 'VCF': input_data = vcf_to_gff.convert(input_file, options) + elif input_type == 'deCODEme': + input_data = gff_from_decodeme.convert(input_file) else: raise Exception("input format not recognized") diff --git a/server/conversion/detect_format.py b/server/conversion/detect_format.py index 22ebb76..72f9fec 100644 --- a/server/conversion/detect_format.py +++ b/server/conversion/detect_format.py @@ -1,6 +1,7 @@ #!/usr/bin/python # Filename: detect_format.py +import csv import re import os import sys @@ -25,15 +26,24 @@ def detect_format(file_input): GFF: General Feature Format VCF: Variant Call Format (only tested for 23andme exome data) """ + looks_like = dict() if isinstance(file_input, str): - f_in = autozip.file_open(file_input, 'r') + try: + f_in = autozip.file_open(file_input, 'r') + except AssertionError: + f_in = autozip.file_open(file_input, 'r', 'deCODEme_scan.csv') + print "deCODEme archive (deCODEme) detected" + looks_like['deCODEme'] = True else: f_in = file_input - looks_like = dict() line_count = 0 for line in f_in: line_count += 1 + if any([looks_like[x] for x in looks_like.keys()]): + break + if line_count > MAX_LINES_CHECKED: + break # Check comment lines, if they exist, for information on file type. if re.match('#', line): @@ -51,41 +61,44 @@ def detect_format(file_input): looks_like['VCF'] = True # Look at other lines and decide based on their format. - data = line.split('\t') - - if ( len(data) > 3 and - re.match(r'rs', data[0]) and - re.match(r'[0-9]', data[2]) and - re.match(r'[ACGT][ACGT]', data[3]) ): + tsv_data = line.split('\t') + csv_data = list(csv.reader([line])) + + if ( len(csv_data) > 5 and + re.match(r'rs', csv_data[0]) and + re.match(r'[ACGT]', csv_data[1]) and + re.match(r'[0-9]', csv_data[3]) and + re.match(r'[+-]', csv_data[4]) and + re.match(r'[ACGT]', csv_data[5]) ): + print "deCODEme microarray genotyping data (deCODEme) guessed" + looks_like['deCODEme'] = True + if ( len(tsv_data) > 3 and + re.match(r'rs', tsv_data[0]) and + re.match(r'[0-9]', tsv_data[2]) and + re.match(r'[ACGT][ACGT]', tsv_data[3]) ): print "23andme microarray genotyping data (23andme) guessed" looks_like['23andme'] = True - if ( len(data) > 6 and - re.match(r'chr', data[3]) and - re.match(r'[0-9]', data[4]) and - re.match(r'[0-9]', data[5]) and - (data[6] == "no-call" or data[6] == "ref") ): + if ( len(tsv_data) > 6 and + re.match(r'chr', tsv_data[3]) and + re.match(r'[0-9]', tsv_data[4]) and + re.match(r'[0-9]', tsv_data[5]) and + (tsv_data[6] == "no-call" or tsv_data[6] == "ref") ): print "Complete Genomics var file format (CGIvar) guessed" looks_like['CGIvar'] = True - if ( len(data) > 6 and - re.match(r'[0-9]', data[3]) and - re.match(r'[0-9]', data[4]) and - data[6] == "+" ): + if ( len(tsv_data) > 6 and + re.match(r'[0-9]', tsv_data[3]) and + re.match(r'[0-9]', tsv_data[4]) and + tsv_data[6] == "+" ): print "General Feature Format (GFF) guessed" looks_like['GFF'] = True - if ( len(data) > 7 and - re.match(r'[0-9]', data[1]) and - re.match(r'[ACGT]', data[3]) and - re.match(r'[ACGT]', data[4]) and - len(data[7].split(';')) > 2 ): + if ( len(tsv_data) > 7 and + re.match(r'[0-9]', tsv_data[1]) and + re.match(r'[ACGT]', tsv_data[3]) and + re.match(r'[ACGT]', tsv_data[4]) and + len(tsv_data[7].split(';')) > 2 ): print "Variant Call Format (VCF) guessed" looks_like['VCF'] = True - if any([looks_like[x] for x in looks_like.keys()]): - break - - if line_count > MAX_LINES_CHECKED: - break - if isinstance(file_input, str): f_in.close() diff --git a/server/conversion/gff_from_decodeme.py b/server/conversion/gff_from_decodeme.py new file mode 100644 index 0000000..04dd62f --- /dev/null +++ b/server/conversion/gff_from_decodeme.py @@ -0,0 +1,113 @@ +#!/usr/bin/python +# Filename: gff_from_decodeme.py +"""Conversion of deCODEme data to GFF for genome processing + +The files should be interpretable by GET-Evidence's genome processing system. +To see command line usage, run with "-h" or "--help". +""" + +import os +import re +import sys +import csv +from optparse import OptionParser + +GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) +if not GETEV_MAIN_PATH in sys.path: + sys.path.insert(1, GETEV_MAIN_PATH) +del GETEV_MAIN_PATH + +from utils import autozip + +DEFAULT_BUILD = "b36" + +def revcomp(sequence): + comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + output = [comp[x] for x in list(sequence)] + output.reverse() + return ''.join(output) + +def convert(genotype_input): + """Take in deCODEme genotype data, yield GFF formatted lines""" + genotype_data = genotype_input + if isinstance(genotype_input, str): + genotype_data = csv.reader(autozip.file_open(genotype_input, 'r', + 'deCODEme_scan.csv')) + else: + genotype_data = csv.reader(genotype_input) + + # We are allowing people to donate only the 'deCODEme_scan.csv' file, + # which unfortunately lacks build information (it is stored separately + # in 'deCODEme_info.txt', but this file also contains the deCODEme + # username). So fare deCODEme files have only been build 36, and so + # this is the current assumption for data processing. + build = "b36" + yield "##genome-build " + build + + header_row = genotype_data.next() + col = dict() + for i in range(len(header_row)): + col[header_row[i]] = i + + for row in genotype_data: + variants = list(row[col['YourCode']]) + if variants[0] == '-': + continue + chromosome = 'chr' + row[col['Chromosome']] + strand = row[col['Strand']] + if strand == '-': + variants = [revcomp(x) for x in variants] + pos_start = row[col['Position']] + pos_end = pos_start + + attributes = '' + if variants[0] == variants[1]: + attributes = 'alleles ' + variants[0] + else: + attributes = 'alleles ' + variants[0] + '/' + variants[1] + if re.match('rs', row[col['Name']]): + attributes = attributes + '; db_xref dbsnp:' + row[col['Name']] + + output = [chromosome, "deCODEme", "SNP", pos_start, pos_end, '.', '+', + '.', attributes] + yield "\t".join(output) + +def convert_to_file(genotype_input, output_file): + """Convert a deCODEme file and output GFF-formatted data to file""" + output = output_file # default assumes writable file object + if isinstance(output_file, str): + output = autozip.file_open(output_file, 'w') + conversion = convert(genotype_input) + for line in conversion: + output.write(line + "\n") + output.close() + +def main(): + # Parse options + usage = ("\n%prog -i inputfile [-o outputfile]\n" + "%prog [-o outputfile] < inputfile") + parser = OptionParser(usage=usage) + parser.add_option("-i", "--input", dest="inputfile", + help="read deCODEme data from INFILE (automatically " + "uncompress if *.zip, *.gz, *.bz2)", metavar="INFILE") + parser.add_option("-o", "--output", dest="outputfile", + help="write report to OUTFILE (automatically compress " + "if *.gz, or *.bz2)", metavar="OUTFILE") + options, args = parser.parse_args() + + # Handle input + if sys.stdin.isatty(): # false if data is piped in + var_input = options.inputfile + else: + var_input = sys.stdin + + # Handle output + if options.outputfile: + convert_to_file(var_input, options.outputfile) + else: + for line in convert(var_input): + print line + + +if __name__ == "__main__": + main() diff --git a/server/genome_analyzer.py b/server/genome_analyzer.py index 37ac284..116a228 100755 --- a/server/genome_analyzer.py +++ b/server/genome_analyzer.py @@ -45,13 +45,11 @@ def process_source(genome_in, metadata=dict(), options=None): """ Open source and return as sorted GFF data. """ - # Handle genome compression, get input, and make best guess of format type. - source_input = autozip.file_open(genome_in, 'r') - metadata['input_type'] = detect_format.detect_format(source_input) + # Make best guess of format type, to be saved in metadata. + metadata['input_type'] = detect_format.detect_format(genome_in) print >> sys.stderr, "file format:", metadata['input_type'] - # Reset input and reopen, converting to GFF if necessary. - source_input.close() + # Open genetic data, decompressing and converting to GFF if necessary. gff_input = convert_to_gff.convert(genome_in, options) # Grab header (don't sort) & genome build. Pipe the rest to UNIX sort. diff --git a/server/utils/autozip.py b/server/utils/autozip.py index e6f8c1c..38c2708 100644 --- a/server/utils/autozip.py +++ b/server/utils/autozip.py @@ -4,7 +4,7 @@ import os import bz2 -def file_open(filename, mode='r'): +def file_open(filename, mode='r', arch_file=''): """Return file obj, with compression if appropriate extension is given Arguments: @@ -19,15 +19,18 @@ def file_open(filename, mode='r'): # ZIP compression actions. if re.search("\.zip", filename): archive = zipfile.ZipFile(filename, mode) + if not hasattr(archive, 'open'): + raise ImportError ('zipfile.ZipFile.open not available. ' + 'Upgrade python to 2.6 (or later) to ' + + 'work with zip-compressed files!') if mode == 'r': files = archive.infolist() - assert len(files) == 1 - if hasattr(archive, "open"): + if not arch_file: + assert len(files) == 1, \ + 'More than one file in ZIP archive, no filename provided.' return archive.open(files[0]) else: - raise ImportError ('zipfile.ZipFile.open not available. ' - 'Upgrade python to 2.6 (or later) to ' + - 'work with zip-compressed files!') + return archive.open(arch_file) else: raise TypeError ('Zip archive only supported for reading.')