Skip to content

Commit

Permalink
Add conversion of deCODEme data
Browse files Browse the repository at this point in the history
Also includes server/utils/autozip.py support for multi-file ZIP
archives, if one specifies a filename to open in the archive.

This is because deCODEme data is provided to users as a ZIP archive
with three files: 'deCODEme_info.txt', 'deCODEme_scan.csv', and
'deCODEme_sign.txt'. The second file contains the genotyping data.

The converter also supports receiving the deCODEme_scan.csv file
directly, rather than the whole archive -- this option is supported
because the deCODEme_info.txt contains the customer's username,
which they might not want to associated with the rest of the data.

Genome build is assumed to be build 36. Build information is
contained within deCODEme_info.txt, but that file is not examined,
or is not available if processing the deCODEme_scan.csv file directly.
As of 8/2012, we haven't seen build 37 data from deCODEme.
  • Loading branch information
Madeleine Price Ball committed Aug 29, 2012
1 parent 29c4204 commit 5d93119
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 40 deletions.
4 changes: 3 additions & 1 deletion server/conversion/convert_to_gff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import sys
from optparse import OptionParser
import detect_format, cgivar_to_gff, gff_from_23andme, vcf_to_gff
import detect_format, cgivar_to_gff, gff_from_23andme, gff_from_decodeme, vcf_to_gff

# A bit of path manipulation to import autozip.py from ../utils/
GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
Expand All @@ -24,6 +24,8 @@ def convert(input_file, options=None):
input_data = gff_from_23andme.convert(input_file)
elif input_type == 'VCF':
input_data = vcf_to_gff.convert(input_file, options)
elif input_type == 'deCODEme':
input_data = gff_from_decodeme.convert(input_file)
else:
raise Exception("input format not recognized")

Expand Down
69 changes: 41 additions & 28 deletions server/conversion/detect_format.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/usr/bin/python
# Filename: detect_format.py

import csv
import re
import os
import sys
Expand All @@ -25,15 +26,24 @@ def detect_format(file_input):
GFF: General Feature Format
VCF: Variant Call Format (only tested for 23andme exome data)
"""
looks_like = dict()
if isinstance(file_input, str):
f_in = autozip.file_open(file_input, 'r')
try:
f_in = autozip.file_open(file_input, 'r')
except AssertionError:
f_in = autozip.file_open(file_input, 'r', 'deCODEme_scan.csv')
print "deCODEme archive (deCODEme) detected"
looks_like['deCODEme'] = True
else:
f_in = file_input

looks_like = dict()
line_count = 0
for line in f_in:
line_count += 1
if any([looks_like[x] for x in looks_like.keys()]):
break
if line_count > MAX_LINES_CHECKED:
break

# Check comment lines, if they exist, for information on file type.
if re.match('#', line):
Expand All @@ -51,41 +61,44 @@ def detect_format(file_input):
looks_like['VCF'] = True

# Look at other lines and decide based on their format.
data = line.split('\t')

if ( len(data) > 3 and
re.match(r'rs', data[0]) and
re.match(r'[0-9]', data[2]) and
re.match(r'[ACGT][ACGT]', data[3]) ):
tsv_data = line.split('\t')
csv_data = list(csv.reader([line]))

if ( len(csv_data) > 5 and
re.match(r'rs', csv_data[0]) and
re.match(r'[ACGT]', csv_data[1]) and
re.match(r'[0-9]', csv_data[3]) and
re.match(r'[+-]', csv_data[4]) and
re.match(r'[ACGT]', csv_data[5]) ):
print "deCODEme microarray genotyping data (deCODEme) guessed"
looks_like['deCODEme'] = True
if ( len(tsv_data) > 3 and
re.match(r'rs', tsv_data[0]) and
re.match(r'[0-9]', tsv_data[2]) and
re.match(r'[ACGT][ACGT]', tsv_data[3]) ):
print "23andme microarray genotyping data (23andme) guessed"
looks_like['23andme'] = True
if ( len(data) > 6 and
re.match(r'chr', data[3]) and
re.match(r'[0-9]', data[4]) and
re.match(r'[0-9]', data[5]) and
(data[6] == "no-call" or data[6] == "ref") ):
if ( len(tsv_data) > 6 and
re.match(r'chr', tsv_data[3]) and
re.match(r'[0-9]', tsv_data[4]) and
re.match(r'[0-9]', tsv_data[5]) and
(tsv_data[6] == "no-call" or tsv_data[6] == "ref") ):
print "Complete Genomics var file format (CGIvar) guessed"
looks_like['CGIvar'] = True
if ( len(data) > 6 and
re.match(r'[0-9]', data[3]) and
re.match(r'[0-9]', data[4]) and
data[6] == "+" ):
if ( len(tsv_data) > 6 and
re.match(r'[0-9]', tsv_data[3]) and
re.match(r'[0-9]', tsv_data[4]) and
tsv_data[6] == "+" ):
print "General Feature Format (GFF) guessed"
looks_like['GFF'] = True
if ( len(data) > 7 and
re.match(r'[0-9]', data[1]) and
re.match(r'[ACGT]', data[3]) and
re.match(r'[ACGT]', data[4]) and
len(data[7].split(';')) > 2 ):
if ( len(tsv_data) > 7 and
re.match(r'[0-9]', tsv_data[1]) and
re.match(r'[ACGT]', tsv_data[3]) and
re.match(r'[ACGT]', tsv_data[4]) and
len(tsv_data[7].split(';')) > 2 ):
print "Variant Call Format (VCF) guessed"
looks_like['VCF'] = True

if any([looks_like[x] for x in looks_like.keys()]):
break

if line_count > MAX_LINES_CHECKED:
break

if isinstance(file_input, str):
f_in.close()

Expand Down
113 changes: 113 additions & 0 deletions server/conversion/gff_from_decodeme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#!/usr/bin/python
# Filename: gff_from_decodeme.py
"""Conversion of deCODEme data to GFF for genome processing
The files should be interpretable by GET-Evidence's genome processing system.
To see command line usage, run with "-h" or "--help".
"""

import os
import re
import sys
import csv
from optparse import OptionParser

GETEV_MAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
if not GETEV_MAIN_PATH in sys.path:
sys.path.insert(1, GETEV_MAIN_PATH)
del GETEV_MAIN_PATH

from utils import autozip

DEFAULT_BUILD = "b36"

def revcomp(sequence):
comp = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
output = [comp[x] for x in list(sequence)]
output.reverse()
return ''.join(output)

def convert(genotype_input):
"""Take in deCODEme genotype data, yield GFF formatted lines"""
genotype_data = genotype_input
if isinstance(genotype_input, str):
genotype_data = csv.reader(autozip.file_open(genotype_input, 'r',
'deCODEme_scan.csv'))
else:
genotype_data = csv.reader(genotype_input)

# We are allowing people to donate only the 'deCODEme_scan.csv' file,
# which unfortunately lacks build information (it is stored separately
# in 'deCODEme_info.txt', but this file also contains the deCODEme
# username). So fare deCODEme files have only been build 36, and so
# this is the current assumption for data processing.
build = "b36"
yield "##genome-build " + build

header_row = genotype_data.next()
col = dict()
for i in range(len(header_row)):
col[header_row[i]] = i

for row in genotype_data:
variants = list(row[col['YourCode']])
if variants[0] == '-':
continue
chromosome = 'chr' + row[col['Chromosome']]
strand = row[col['Strand']]
if strand == '-':
variants = [revcomp(x) for x in variants]
pos_start = row[col['Position']]
pos_end = pos_start

attributes = ''
if variants[0] == variants[1]:
attributes = 'alleles ' + variants[0]
else:
attributes = 'alleles ' + variants[0] + '/' + variants[1]
if re.match('rs', row[col['Name']]):
attributes = attributes + '; db_xref dbsnp:' + row[col['Name']]

output = [chromosome, "deCODEme", "SNP", pos_start, pos_end, '.', '+',
'.', attributes]
yield "\t".join(output)

def convert_to_file(genotype_input, output_file):
"""Convert a deCODEme file and output GFF-formatted data to file"""
output = output_file # default assumes writable file object
if isinstance(output_file, str):
output = autozip.file_open(output_file, 'w')
conversion = convert(genotype_input)
for line in conversion:
output.write(line + "\n")
output.close()

def main():
# Parse options
usage = ("\n%prog -i inputfile [-o outputfile]\n"
"%prog [-o outputfile] < inputfile")
parser = OptionParser(usage=usage)
parser.add_option("-i", "--input", dest="inputfile",
help="read deCODEme data from INFILE (automatically "
"uncompress if *.zip, *.gz, *.bz2)", metavar="INFILE")
parser.add_option("-o", "--output", dest="outputfile",
help="write report to OUTFILE (automatically compress "
"if *.gz, or *.bz2)", metavar="OUTFILE")
options, args = parser.parse_args()

# Handle input
if sys.stdin.isatty(): # false if data is piped in
var_input = options.inputfile
else:
var_input = sys.stdin

# Handle output
if options.outputfile:
convert_to_file(var_input, options.outputfile)
else:
for line in convert(var_input):
print line


if __name__ == "__main__":
main()
8 changes: 3 additions & 5 deletions server/genome_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,13 +45,11 @@ def process_source(genome_in, metadata=dict(), options=None):
"""
Open source and return as sorted GFF data.
"""
# Handle genome compression, get input, and make best guess of format type.
source_input = autozip.file_open(genome_in, 'r')
metadata['input_type'] = detect_format.detect_format(source_input)
# Make best guess of format type, to be saved in metadata.
metadata['input_type'] = detect_format.detect_format(genome_in)
print >> sys.stderr, "file format:", metadata['input_type']

# Reset input and reopen, converting to GFF if necessary.
source_input.close()
# Open genetic data, decompressing and converting to GFF if necessary.
gff_input = convert_to_gff.convert(genome_in, options)

# Grab header (don't sort) & genome build. Pipe the rest to UNIX sort.
Expand Down
15 changes: 9 additions & 6 deletions server/utils/autozip.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
import bz2

def file_open(filename, mode='r'):
def file_open(filename, mode='r', arch_file=''):
"""Return file obj, with compression if appropriate extension is given
Arguments:
Expand All @@ -19,15 +19,18 @@ def file_open(filename, mode='r'):
# ZIP compression actions.
if re.search("\.zip", filename):
archive = zipfile.ZipFile(filename, mode)
if not hasattr(archive, 'open'):
raise ImportError ('zipfile.ZipFile.open not available. '
'Upgrade python to 2.6 (or later) to ' +
'work with zip-compressed files!')
if mode == 'r':
files = archive.infolist()
assert len(files) == 1
if hasattr(archive, "open"):
if not arch_file:
assert len(files) == 1, \
'More than one file in ZIP archive, no filename provided.'
return archive.open(files[0])
else:
raise ImportError ('zipfile.ZipFile.open not available. '
'Upgrade python to 2.6 (or later) to ' +
'work with zip-compressed files!')
return archive.open(arch_file)
else:
raise TypeError ('Zip archive only supported for reading.')

Expand Down

0 comments on commit 5d93119

Please sign in to comment.