CHK-25 updated checkfiles handling of pacbio read names and read_name_details (#15)
gabdank authored and caseylitton committed Jun 20, 2018
1 parent 65ca6e2 commit 6baf4be
Showing 1 changed file with 152 additions and 88 deletions.
240 changes: 152 additions & 88 deletions checkfiles.py
@@ -38,6 +38,27 @@
"wig"
]

read_name_prefix = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+)$')

read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

special_read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[/1|/2]*[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

srr_read_name_pattern = re.compile(
'^(@SRR[\d.]+)$'
)

pacbio_read_name_pattern = re.compile(
'^(@m\d{6}_\d{6}_\d+_[a-zA-Z\d_-]+\/.*)$|^(@c.+)$'
)
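# Illustrative read names (hypothetical examples, chosen only to show what the
# patterns above are intended to match; they are not taken from real data):
#   read_name_prefix:          @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055
#   read_name_pattern:         @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055 1:N:0:ATGCAT
#   special_read_name_pattern: @HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055/1 1:N:0:ATGCAT
#   srr_read_name_pattern:     @SRR1264357.1
#   pacbio_read_name_pattern:  @m140415_143853_42175_c100635972550000001823121909121417_s1_p0/553/0_1755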

def is_path_gzipped(path):
with open(path, 'rb') as f:
@@ -277,6 +298,19 @@ def process_new_illumina_prefix(read_name,
return old_illumina_current_prefix


def process_pacbio_read_name_pattern(
read_name,
signatures_set,
movie_identifier
):
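# For illustration, a hypothetical PacBio read name such as
# '@m140415_143853_42175_c100635972550000001823121909121417_s1_p0/553/0_1755'
# splits on '/' into three parts, so the movie identifier
# '@m140415_143853_42175_c100635972550000001823121909121417_s1_p0' is kept and
# recorded below as the signature 'pacbio:0:1::' + movie_identifier.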
arr = re.split(r'/', read_name)
if len(arr) > 1:
movie_identifier = arr[0]
signatures_set.add(
'pacbio:0:1::' + movie_identifier)
return movie_identifier


def process_old_illumina_read_name_pattern(read_name,
read_numbers_set,
signatures_set,
@@ -314,83 +348,116 @@ def process_old_illumina_read_name_pattern(read_name,


def process_read_name_line(read_name_line,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, srr_flag):
errors, srr_flag, read_name_details):
read_name = read_name_line.strip()
words_array = re.split(r'\s', read_name)
if read_name_pattern.match(read_name) is None:
if special_read_name_pattern.match(read_name) is not None:
process_special_read_name_pattern(read_name,
words_array,
signatures_set,
signatures_no_barcode_set,
read_numbers_set,
srr_flag)
elif srr_read_name_pattern.match(read_name.split(' ')[0]) is not None:
# if the read name follows the SRR format, the read number is defined by the
# SRR format specification, not by the Illumina portion of the read name
# srr_flag is used to distinguish between SRR and "regular" read name formats
srr_portion = read_name.split(' ')[0]
if srr_portion.count('.') == 2:
read_numbers_set.add(srr_portion[-1])
else:
read_numbers_set.add('1')
illumina_portion = read_name.split(' ')[1]
old_illumina_current_prefix = process_read_name_line('@'+illumina_portion,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, True)
if read_name_details:
# extract fastq signature parts using read_name_details
read_name_array = re.split(r'[:\s]', read_name)

flowcell = read_name_array[read_name_details['flowcell_id_location']]
lane_number = read_name_array[read_name_details['lane_id_location']]
if not read_name_details.get('read_number_location'):
read_number = 1
else:
# unrecognized read_name_format
# current convention is to include WHOLE
# readname at the end of the signature
if len(words_array) == 1:
if read_name_prefix.match(read_name) is not None:
# new illumina without second part
old_illumina_current_prefix = process_new_illumina_prefix(
read_number = read_name_array[read_name_details['read_number_location']]
read_numbers_set.add(read_number)

if not read_name_details.get('barcode_location'):
barcode_index = ''
else:
barcode_index = read_name_array[read_name_details['barcode_location']]

signatures_set.add(
flowcell + ':' + lane_number + ':' +
read_number + ':' + barcode_index + ':')
signatures_no_barcode_set.add(
flowcell + ':' + lane_number + ':' +
read_number + ':')
else:
words_array = re.split(r'\s', read_name)
if read_name_pattern.match(read_name) is None:
if special_read_name_pattern.match(read_name) is not None:
process_special_read_name_pattern(read_name,
words_array,
signatures_set,
signatures_no_barcode_set,
read_numbers_set,
srr_flag)
elif srr_read_name_pattern.match(read_name.split(' ')[0]) is not None:
# if the read name follows the SRR format, the read number is defined by the
# SRR format specification, not by the Illumina portion of the read name
# srr_flag is used to distinguish between SRR and "regular" read name formats
srr_portion = read_name.split(' ')[0]
if srr_portion.count('.') == 2:
read_numbers_set.add(srr_portion[-1])
else:
read_numbers_set.add('1')
illumina_portion = read_name.split(' ')[1]
old_illumina_current_prefix = process_read_name_line('@'+illumina_portion,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, True, read_name_details)
elif pacbio_read_name_pattern.match(read_name):
# pacbio reads include:
# movie identifier that includes the time of run start (m140415_143853)
# instrument serial number (42175)
# SMRT cell barcode (c100635972550000001823121909121417)
# set number
# part number
# m140415_143853_42175_c100635972550000001823121909121417_s1_p0/....
movie_identifier = read_name.split('/')[0]
if len(movie_identifier) > 0:
process_pacbio_read_name_pattern(
read_name,
signatures_set,
old_illumina_current_prefix,
read_numbers_set,
srr_flag)
movie_identifier
)
else:
errors['fastq_format_readname'] = read_name
else:
# unrecognized read_name_format
# current convention is to include WHOLE
# readname at the end of the signature
if len(words_array) == 1:
if read_name_prefix.match(read_name) is not None:
# new illumina without second part
old_illumina_current_prefix = process_new_illumina_prefix(
read_name,
signatures_set,
old_illumina_current_prefix,
read_numbers_set,
srr_flag)

elif len(read_name) > 3 and read_name.count(':') > 2:
# assuming old illumina format
old_illumina_current_prefix = process_old_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
old_illumina_current_prefix,
srr_flag)
elif len(read_name) > 3 and read_name.count(':') > 2:
# assuming old illumina format
old_illumina_current_prefix = process_old_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
old_illumina_current_prefix,
srr_flag)
else:
errors['fastq_format_readname'] = read_name
# the only case to skip update content error - due to the changing
# nature of read names
else:
errors['fastq_format_readname'] = read_name
# the only case to skip update content error - due to the changing
# nature of read names
else:
errors['fastq_format_readname'] = read_name
# found a match to the regex of "almost" illumina read_name
else:
process_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
signatures_no_barcode_set,
srr_flag)
# found a match to the regex of "almost" illumina read_name
else:
process_illumina_read_name_pattern(
read_name,
read_numbers_set,
signatures_set,
signatures_no_barcode_set,
srr_flag)

return old_illumina_current_prefix
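# A minimal sketch of the signature built in the read_name_details branch above,
# assuming a hypothetical Illumina-style read name and made-up index values:
#   read name: '@HWI-ST354R:351:C0UAGACXX:1:1101:1213:2055 1:N:0:ATGCAT'
#   read_name_details: {'flowcell_id_location': 2, 'lane_id_location': 3,
#                       'read_number_location': 7, 'barcode_location': 10}
#   resulting signature: 'C0UAGACXX:1:1:ATGCAT:'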

@@ -407,23 +474,7 @@ def process_fastq_file(job, fastq_data_stream, session, url):
errors = job['errors']
result = job['result']

read_name_prefix = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+)$')

read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

special_read_name_pattern = re.compile(
'^(@[a-zA-Z\d]+[a-zA-Z\d_-]*:[a-zA-Z\d-]+:[a-zA-Z\d_-]' +
'+:\d+:\d+:\d+:\d+[/1|/2]*[\s_][12]:[YXN]:[0-9]+:([ACNTG\+]*|[0-9]*))$'
)

srr_read_name_pattern = re.compile(
'^(@SRR[\d.]+)$'
)
read_name_details = get_read_name_details(job.get('@id'), errors, session, url)

read_numbers_set = set()
signatures_set = set()
@@ -441,19 +492,19 @@ def process_fastq_file(job, fastq_data_stream, session, url):
else:
line_index += 1
if line_index == 1:

# maybe deliver a flag from here about the presence/absence of read_name_details

old_illumina_current_prefix = \
process_read_name_line(
line,
read_name_prefix,
read_name_pattern,
special_read_name_pattern,
srr_read_name_pattern,
old_illumina_current_prefix,
read_numbers_set,
signatures_no_barcode_set,
signatures_set,
read_lengths_dictionary,
errors, False)
errors, False,
read_name_details)
if line_index == 2:
read_count += 1
process_sequence_line(line, read_lengths_dictionary)
@@ -594,6 +645,19 @@ def compare_flowcell_details(flowcell_details_1, flowcell_details_2):
return False


def get_read_name_details(job_id, errors, session, url):
query = job_id +'?datastore=database&frame=object&format=json'
try:
r = session.get(urljoin(url, query))
except requests.exceptions.RequestException as e:
errors['lookup_for_read_name_details'] = ('Network error occurred while looking for '
'file read_name details on the portal. {}').format(str(e))
else:
details = r.json().get('read_name_details')
if details:
return details
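# Hypothetical example (not from the original code) of the read_name_details
# object this helper might return from the portal; the index values are made up:
#   {'flowcell_id_location': 2, 'lane_id_location': 3,
#    'read_number_location': 7, 'barcode_location': 10}
# process_read_name_line uses these indices to pull signature parts out of the
# read name after splitting it on ':' and whitespace.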


def get_platform_uuid(job_id, errors, session, url):
query = job_id +'?datastore=database&frame=object&format=json'
try:
@@ -1056,7 +1120,7 @@ def run(out, err, url, username, password, encValData, mirror, search_query, fil
except NotImplementedError:
nprocesses = 1

version = '1.20'
version = '1.22'

try:
ip_output = subprocess.check_output(
