CHK-25 updated checkfiles handling of pacbio read names and read_name_details (#15)
gabdank authored and caseylitton committed Jun 20, 2018
1 parent 65ca6e2 commit 6baf4be
Showing 1 changed file with 152 additions and 88 deletions.
240 changes: 152 additions & 88 deletions checkfiles.py
@@ -38,6 +38,27 @@
"wig"
]

read_name_prefix = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+)$')

read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

special_read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[/1|/2]*[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

srr_read_name_pattern = re.compile(
'^(@SRR[\d.]+)$'
)

pacbio_read_name_pattern = re.compile(
'^(@m\d{6}_\d{6}_\d+_[a-zA-Z\d_-]+\/.*)$|^(@c.+)$'
)
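# Illustrative read names (hypothetical examples, chosen only to show what the
# patterns above are intended to match; they are not taken from real data):
#   read_name_prefix:          @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055
#   read_name_pattern:         @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055 1:N:0:ATGCAT
#   special_read_name_pattern: @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055/1 1:N:0:ATGCAT
#   srr_read_name_pattern:     @SRR1264357.1
#   pacbio_read_name_pattern:  @m140415_143853_42175_c100635972550000001823121909121417_s1_p0/553/0_1755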

def is_path_gzipped(path):
with open(path, 'rb') as f:
@@ -277,6 +298,19 @@ def process_new_illumina_prefix(read_name,
return old_illumina_current_prefix


def process_pacbio_read_name_pattern(
read_name,
signatures_set,
movie_identifier
):
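# For illustration, a hypothetical PacBio read name such as
# '@m140415_143853_42175_c100635972550000001823121909121417_s1_p0/553/0_1755'
# splits on '/' into three parts, so the movie identifier
# '@m140415_143853_42175_c100635972550000001823121909121417_s1_p0' is kept and
# recorded below as the signature 'pacbio:0:1::' + movie_identifier.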
arr = re.split(r'/', read_name)
if len(arr) > 1:
movie_identifier = arr[0]
signatures_set.add(
'pacbio:0:1::' + movie_identifier)
return movie_identifier


def process_old_illumina_read_name_pattern(read_name,
read_numbers_set,
signatures_set,
@@ -314,83 +348,116 @@ def process_old_illumina_read_name_pattern(read_name,


def process_read_name_line(read_name_line,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, srr_flag):
errors, srr_flag, read_name_details):
read_name = read_name_line.strip()
words_array = re.split(r'\s', read_name)
if read_name_pattern.match(read_name) is None:
if special_read_name_pattern.match(read_name) is not None:
process_special_read_name_pattern(read_name,
words_array,
signatures_set,
signatures_no_barcode_set,
read_numbers_set,
srr_flag)
elif srr_read_name_pattern.match(read_name.split(' ')[0]) is not None:
# if the read name follows the SRR format, the read number is defined by the
# SRR format specification, not by the Illumina portion of the read name
# srr_flag is used to distinguish between SRR and "regular" read name formats
srr_portion = read_name.split(' ')[0]
if srr_portion.count('.') == 2:
read_numbers_set.add(srr_portion[-1])
else:
read_numbers_set.add('1')
illumina_portion = read_name.split(' ')[1]
old_illumina_current_prefix = process_read_name_line('@'+illumina_portion,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, True)
if read_name_details:
# extract fastq signature parts using read_name_details
read_name_array = re.split(r'[:\s]', read_name)

flowcell = read_name_array[read_name_details['flowcell_id_location']]
lane_number = read_name_array[read_name_details['lane_id_location']]
if not read_name_details.get('read_number_location'):
read_number = 1
else:
# unrecognized read_name_format
# current convention is to include WHOLE
# readname at the end of the signature
if len(words_array) == 1:
if read_name_prefix.match(read_name) is not None:
# new illumina without second part
old_illumina_current_prefix = process_new_illumina_prefix(
read_number = read_name_array[read_name_details['read_number_location']]
read_numbers_set.add(read_number)

if not read_name_details.get('barcode_location'):
barcode_index = ''
else:
barcode_index = read_name_array[read_name_details['barcode_location']]

signatures_set.add(
flowcell + ':' + lane_number + ':' +
read_number + ':' + barcode_index + ':')
signatures_no_barcode_set.add(
flowcell + ':' + lane_number + ':' +
read_number + ':')
else:
words_array = re.split(r'\s', read_name)
if read_name_pattern.match(read_name) is None:
if special_read_name_pattern.match(read_name) is not None:
process_special_read_name_pattern(read_name,
words_array,
signatures_set,
signatures_no_barcode_set,
read_numbers_set,
srr_flag)
elif srr_read_name_pattern.match(read_name.split(' ')[0]) is not None:
# if the read name follows the SRR format, the read number is defined by the
# SRR format specification, not by the Illumina portion of the read name
# srr_flag is used to distinguish between SRR and "regular" read name formats
srr_portion = read_name.split(' ')[0]
if srr_portion.count('.') == 2:
read_numbers_set.add(srr_portion[-1])
else:
read_numbers_set.add('1')
illumina_portion = read_name.split(' ')[1]
old_illumina_current_prefix = process_read_name_line('@'+illumina_portion,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, True, read_name_details)
elif pacbio_read_name_pattern.match(read_name):
# pacbio reads include:
# movie identifier that includes the time of run start (m140415_143853)
# instrument serial number (42175)
# SMRT cell barcode (c100635972550000001823121909121417)
# set number
# part number
# m140415_143853_42175_c100635972550000001823121909121417_s1_p0/....
movie_identifier = read_name.split('/')[0]
if len(movie_identifier) > 0:
process_pacbio_read_name_pattern(
read_name,
signatures_set,
old_illumina_current_prefix,
read_numbers_set,
srr_flag)
movie_identifier
)
else:
errors['fastq_format_readname'] = read_name
else:
# unrecognized read_name_format
# current convention is to include WHOLE
# readname at the end of the signature
if len(words_array) == 1:
if read_name_prefix.match(read_name) is not None:
# new illumina without second part
old_illumina_current_prefix = process_new_illumina_prefix(
read_name,
signatures_set,
old_illumina_current_prefix,
read_numbers_set,
srr_flag)

elif len(read_name) > 3 and read_name.count(':') > 2:
# assuming old illumina format
old_illumina_current_prefix = process_old_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
old_illumina_current_prefix,
srr_flag)
elif len(read_name) > 3 and read_name.count(':') > 2:
# assuming old illumina format
old_illumina_current_prefix = process_old_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
old_illumina_current_prefix,
srr_flag)
else:
errors['fastq_format_readname'] = read_name
# the only case to skip update content error - due to the changing
# nature of read names
else:
errors['fastq_format_readname'] = read_name
# the only case to skip update content error - due to the changing
# nature of read names
else:
errors['fastq_format_readname'] = read_name
# found a match to the regex of "almost" illumina read_name
else:
process_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
signatures_no_barcode_set,
srr_flag)
# found a match to the regex of "almost" illumina read_name
else:
process_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
signatures_no_barcode_set,
srr_flag)

return old_illumina_current_prefix
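# A minimal sketch of the signature built in the read_name_details branch above,
# assuming a hypothetical Illumina-style read name and made-up index values:
#   read name: '@HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055 1:N:0:ATGCAT'
#   read_name_details: {'flowcell_id_location': 2, 'lane_id_location': 3,
#                       'read_number_location': 7, 'barcode_location': 10}
#   resulting signature: 'C0UAGACXX:1:1:ATGCAT:'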

@@ -407,23 +474,7 @@ def process_fastq_file(job, fastq_data_stream, session, url):
errors = job['errors']
result = job['result']

read_name_prefix = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+)$')

read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

special_read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[/1|/2]*[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

srr_read_name_pattern = re.compile(
'^(@SRR[\d.]+)$'
)
read_name_details = get_read_name_details(job.get('@id'), errors, session, url)

read_numbers_set = set()
signatures_set = set()
@@ -441,19 +492,19 @@ def process_fastq_file(job, fastq_data_stream, session, url):
else:
line_index += 1
if line_index == 1:

# maybe deliver a flag from here about the presence/absence of read_name_details

old_illumina_current_prefix = \
process_read_name_line(
line,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, False)
errors, False,
read_name_details)
if line_index == 2:
read_count += 1
process_sequence_line(line, read_lengths_dictionary)
@@ -594,6 +645,19 @@ def compare_flowcell_details(flowcell_details_1, flowcell_details_2):
return False


def get_read_name_details(job_id, errors, session, url):
query = job_id +'?datastore=database&frame=object&format=json'
try:
r = session.get(urljoin(url, query))
except requests.exceptions.RequestException as e:
errors['lookup_for_read_name_details'] = ('Network error occurred while looking for '
'file read_name details on the portal. {}').format(str(e))
else:
details = r.json().get('read_name_details')
if details:
return details
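# Hypothetical example (not from the original code) of the read_name_details
# object this helper might return from the portal; the index values are made up:
#   {'flowcell_id_location': 2, 'lane_id_location': 3,
#    'read_number_location': 7, 'barcode_location': 10}
# process_read_name_line uses these indices to pull signature parts out of the
# read name after splitting it on ':' and whitespace.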


def get_platform_uuid(job_id, errors, session, url):
query = job_id +'?datastore=database&frame=object&format=json'
try:
@@ -1056,7 +1120,7 @@ def run(out, err, url, username, password, encValData, mirror, search_query, fil
except NotImplementedError:
nprocesses = 1

version = '1.20'
version = '1.22'

try:
ip_output = subprocess.check_output(
