Add conversion of deCODEme data

Also includes server/utils/autozip.py support for multi-file ZIP archives, if one specifies a filename to open in the archive. This is because deCODEme data is provided to users as a ZIP archive with three files: 'deCODEme_info.txt', 'deCODEme_scan.csv', and 'deCODEme_sign.txt'. The second file contains the genotyping data. The converter also supports receiving the deCODEme_scan.csv file directly, rather than the whole archive -- this option is supported because the deCODEme_info.txt contains the customer's username, which they might not want to associated with the rest of the data. Genome build is assumed to be build 36. Build information is contained within deCODEme_info.txt, but that file is not examined, or is not available if processing the deCODEme_scan.csv file directly. As of 8/2012, we haven't seen build 37 data from deCODEme.
madprime · Aug 29, 2012 · 5d93119 · 5d93119
1 parent 29c4204
commit 5d93119
Show file tree

Hide file tree

Showing 5 changed files with 169 additions and 40 deletions.
diff --git a/server/conversion/convert_to_gff.py b/server/conversion/convert_to_gff.py
@@ -4,7 +4,7 @@
 import os
 import sys
 from optparse import OptionParser
-import detect_format, cgivar_to_gff, gff_from_23andme, vcf_to_gff
+import detect_format, cgivar_to_gff, gff_from_23andme, gff_from_decodeme, vcf_to_gff
 
 # A bit of path manipulation to import autozip.py from ../utils/
 GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
@@ -24,6 +24,8 @@ def convert(input_file, options=None):
         input_data = gff_from_23andme.convert(input_file)
     elif input_type == 'VCF':
         input_data = vcf_to_gff.convert(input_file, options)
+    elif input_type == 'deCODEme':
+        input_data = gff_from_decodeme.convert(input_file)
     else:
         raise Exception("input format not recognized")
 

diff --git a/server/conversion/detect_format.py b/server/conversion/detect_format.py
@@ -1,6 +1,7 @@
 #!/usr/bin/python
 # Filename: detect_format.py
 
+import csv
 import re
 import os
 import sys
@@ -25,15 +26,24 @@ def detect_format(file_input):
       GFF: General Feature Format
       VCF: Variant Call Format (only tested for 23andme exome data)
     """
+    looks_like = dict()
     if isinstance(file_input, str):
-        f_in = autozip.file_open(file_input, 'r')
+        try:
+            f_in = autozip.file_open(file_input, 'r')
+        except AssertionError:
+            f_in = autozip.file_open(file_input, 'r', 'deCODEme_scan.csv')
+            print "deCODEme archive (deCODEme) detected"
+            looks_like['deCODEme'] = True
     else:
         f_in = file_input
 
-    looks_like = dict()
     line_count = 0
     for line in f_in:
         line_count += 1
+        if any([looks_like[x] for x in looks_like.keys()]):
+            break    
+        if line_count > MAX_LINES_CHECKED:
+            break
 
         # Check comment lines, if they exist, for information on file type.
         if re.match('#', line):
@@ -51,41 +61,44 @@ def detect_format(file_input):
                 looks_like['VCF'] = True
 
         # Look at other lines and decide based on their format.
-        data = line.split('\t')
-
-        if ( len(data) > 3 and
-             re.match(r'rs', data[0]) and 
-             re.match(r'[0-9]', data[2]) and
-             re.match(r'[ACGT][ACGT]', data[3]) ):
+        tsv_data = line.split('\t')
+        csv_data = list(csv.reader([line]))
+
+        if ( len(csv_data) > 5 and
+             re.match(r'rs', csv_data[0]) and
+             re.match(r'[ACGT]', csv_data[1]) and
+             re.match(r'[0-9]', csv_data[3]) and
+             re.match(r'[+-]', csv_data[4]) and
+             re.match(r'[ACGT]', csv_data[5]) ):
+            print "deCODEme microarray genotyping data (deCODEme) guessed"
+            looks_like['deCODEme'] = True
+        if ( len(tsv_data) > 3 and
+             re.match(r'rs', tsv_data[0]) and 
+             re.match(r'[0-9]', tsv_data[2]) and
+             re.match(r'[ACGT][ACGT]', tsv_data[3]) ):
             print "23andme microarray genotyping data (23andme) guessed"
             looks_like['23andme'] = True
-        if ( len(data) > 6 and
-             re.match(r'chr', data[3]) and 
-             re.match(r'[0-9]', data[4]) and 
-             re.match(r'[0-9]', data[5]) and
-             (data[6] == "no-call" or data[6] == "ref") ):
+        if ( len(tsv_data) > 6 and
+             re.match(r'chr', tsv_data[3]) and 
+             re.match(r'[0-9]', tsv_data[4]) and 
+             re.match(r'[0-9]', tsv_data[5]) and
+             (tsv_data[6] == "no-call" or tsv_data[6] == "ref") ):
             print "Complete Genomics var file format (CGIvar) guessed"
             looks_like['CGIvar'] = True
-        if ( len(data) > 6 and
-             re.match(r'[0-9]', data[3]) and
-             re.match(r'[0-9]', data[4]) and
-             data[6] == "+" ):
+        if ( len(tsv_data) > 6 and
+             re.match(r'[0-9]', tsv_data[3]) and
+             re.match(r'[0-9]', tsv_data[4]) and
+             tsv_data[6] == "+" ):
             print "General Feature Format (GFF) guessed"
             looks_like['GFF'] = True
-        if ( len(data) > 7 and
-             re.match(r'[0-9]', data[1]) and
-             re.match(r'[ACGT]', data[3]) and
-             re.match(r'[ACGT]', data[4]) and
-             len(data[7].split(';')) > 2 ):
+        if ( len(tsv_data) > 7 and
+             re.match(r'[0-9]', tsv_data[1]) and
+             re.match(r'[ACGT]', tsv_data[3]) and
+             re.match(r'[ACGT]', tsv_data[4]) and
+             len(tsv_data[7].split(';')) > 2 ):
             print "Variant Call Format (VCF) guessed"
             looks_like['VCF'] = True
 
-        if any([looks_like[x] for x in looks_like.keys()]):
-            break
-
-        if line_count > MAX_LINES_CHECKED:
-            break
-
     if isinstance(file_input, str):
         f_in.close()
 

diff --git a/server/conversion/gff_from_decodeme.py b/server/conversion/gff_from_decodeme.py
@@ -0,0 +1,113 @@
+#!/usr/bin/python
+# Filename: gff_from_decodeme.py
+"""Conversion of deCODEme data to GFF for genome processing
+
+The files should be interpretable by GET-Evidence's genome processing system.                                                                                 
+To see command line usage, run with "-h" or "--help".
+"""
+
+import os
+import re
+import sys
+import csv
+from optparse import OptionParser
+
+GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
+if not GETEV_MAIN_PATH in sys.path:
+    sys.path.insert(1, GETEV_MAIN_PATH)
+del GETEV_MAIN_PATH
+
+from utils import autozip
+
+DEFAULT_BUILD = "b36"
+
+def revcomp(sequence):
+    comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
+    output = [comp[x] for x in list(sequence)]
+    output.reverse()
+    return ''.join(output)
+
+def convert(genotype_input):
+    """Take in deCODEme genotype data, yield GFF formatted lines"""
+    genotype_data = genotype_input
+    if isinstance(genotype_input, str):
+        genotype_data = csv.reader(autozip.file_open(genotype_input, 'r', 
+                                                     'deCODEme_scan.csv'))
+    else:
+        genotype_data = csv.reader(genotype_input)
+
+    # We are allowing people to donate only the 'deCODEme_scan.csv' file, 
+    # which unfortunately lacks build information (it is stored separately 
+    # in 'deCODEme_info.txt', but this file also contains the deCODEme 
+    # username). So fare deCODEme files have only been build 36, and so 
+    # this is the current assumption for data processing.
+    build = "b36"
+    yield "##genome-build " + build
+
+    header_row = genotype_data.next()
+    col = dict()
+    for i in range(len(header_row)):
+        col[header_row[i]] = i
+
+    for row in genotype_data:
+        variants = list(row[col['YourCode']])
+        if variants[0] == '-':
+            continue
+        chromosome = 'chr' + row[col['Chromosome']]
+        strand = row[col['Strand']]
+        if strand == '-':
+            variants = [revcomp(x) for x in variants]
+        pos_start = row[col['Position']]
+        pos_end = pos_start
+
+        attributes = ''
+        if variants[0] == variants[1]:
+            attributes = 'alleles ' + variants[0]
+        else:
+            attributes = 'alleles ' + variants[0] + '/' + variants[1]
+        if re.match('rs', row[col['Name']]):
+            attributes = attributes + '; db_xref dbsnp:' + row[col['Name']]
+
+        output = [chromosome, "deCODEme", "SNP", pos_start, pos_end, '.', '+', 
+                  '.', attributes]
+        yield "\t".join(output)
+
+def convert_to_file(genotype_input, output_file):
+    """Convert a deCODEme file and output GFF-formatted data to file"""
+    output = output_file  # default assumes writable file object
+    if isinstance(output_file, str):
+        output = autozip.file_open(output_file, 'w')
+    conversion = convert(genotype_input)
+    for line in conversion:
+        output.write(line + "\n")
+    output.close()
+
+def main():
+    # Parse options
+    usage = ("\n%prog -i inputfile [-o outputfile]\n"
+             "%prog [-o outputfile] < inputfile")
+    parser = OptionParser(usage=usage)
+    parser.add_option("-i", "--input", dest="inputfile",
+                      help="read deCODEme data from INFILE (automatically "
+                      "uncompress if *.zip, *.gz, *.bz2)", metavar="INFILE")
+    parser.add_option("-o", "--output", dest="outputfile",
+                      help="write report to OUTFILE (automatically compress "
+                      "if *.gz, or *.bz2)", metavar="OUTFILE")
+    options, args = parser.parse_args()
+
+    # Handle input
+    if sys.stdin.isatty():  # false if data is piped in
+        var_input = options.inputfile
+    else:
+        var_input = sys.stdin
+
+    # Handle output
+    if options.outputfile:
+        convert_to_file(var_input, options.outputfile)
+    else:
+        for line in convert(var_input):
+            print line
+
+
+if __name__ == "__main__":
+    main()
diff --git a/server/genome_analyzer.py b/server/genome_analyzer.py
@@ -45,13 +45,11 @@ def process_source(genome_in, metadata=dict(), options=None):
     """
     Open source and return as sorted GFF data.
     """
-    # Handle genome compression, get input, and make best guess of format type.
-    source_input = autozip.file_open(genome_in, 'r')
-    metadata['input_type'] = detect_format.detect_format(source_input)
+    # Make best guess of format type, to be saved in metadata.
+    metadata['input_type'] = detect_format.detect_format(genome_in)
     print >> sys.stderr, "file format:", metadata['input_type']
 
-    # Reset input and reopen, converting to GFF if necessary.
-    source_input.close()
+    # Open genetic data, decompressing and converting to GFF if necessary.
     gff_input = convert_to_gff.convert(genome_in, options)
 
     # Grab header (don't sort) & genome build. Pipe the rest to UNIX sort.

diff --git a/server/utils/autozip.py b/server/utils/autozip.py
@@ -4,7 +4,7 @@
 import os
 import bz2
 
-def file_open(filename, mode='r'):
+def file_open(filename, mode='r', arch_file=''):
     """Return file obj, with compression if appropriate extension is given
 
     Arguments:
@@ -19,15 +19,18 @@ def file_open(filename, mode='r'):
     # ZIP compression actions.
     if re.search("\.zip", filename):
         archive = zipfile.ZipFile(filename, mode)
+        if not hasattr(archive, 'open'):
+            raise ImportError ('zipfile.ZipFile.open not available. '
+                               'Upgrade python to 2.6 (or later) to ' +
+                               'work with zip-compressed files!')
         if mode == 'r':
             files = archive.infolist()
-            assert len(files) == 1
-            if hasattr(archive, "open"):
+            if not arch_file:
+                assert len(files) == 1, \
+                    'More than one file in ZIP archive, no filename provided.'
                 return archive.open(files[0])
             else:
-                raise ImportError ('zipfile.ZipFile.open not available. '
-                                   'Upgrade python to 2.6 (or later) to ' +
-                                   'work with zip-compressed files!')
+                return archive.open(arch_file)
         else:
             raise TypeError ('Zip archive only supported for reading.')