diff --git a/bagitutils.py b/bagitutils.py index 1e679d4..3cdc2dc 100644 --- a/bagitutils.py +++ b/bagitutils.py @@ -1,25 +1,115 @@ #!/usr/bin/env python - import shutil +import operator import zipfile import os +from StringIO import StringIO + import bagit -import pandas as pd -import numpy as np -import re import tempfile +import csv + + +# Some column names from Boardwalk manifest. These are the ones that require +# special handling when converting into the FireCloud scheme. +class BoardwalkColumns: + def __init__(self): + pass + + SAMPLE_UUID = 'Sample UUID' + DONOR_UUID = 'Donor UUID' + FILE_TYPE = 'File Type' + FILE_URLS = 'File URLs' + SPECIMEN_UUID = 'Specimen UUID' + FILE_DOS_URI = 'File DOS URI' + FILE_PATH = 'File Path' + UPLOAD_FILE_ID = 'Upload File ID' + + +# Column names in Boardwalk that are file related, except for FILE_URLS, +# which is extra special because its value is a comma-separated list. +FILE_COLUMNS = [ + BoardwalkColumns.FILE_DOS_URI, + BoardwalkColumns.FILE_TYPE, + BoardwalkColumns.FILE_PATH, + BoardwalkColumns.UPLOAD_FILE_ID +] + +# Column names in Boardwalk that cannot simply be copied over to FireCloud; +# they require extra logic. +COMPLEX_COLUMNS = FILE_COLUMNS + [ + BoardwalkColumns.SAMPLE_UUID, + BoardwalkColumns.FILE_URLS +] + + +class RequiredFirecloudColumns: + """ + Columns must be present in FireCloud TSVs. The TSVs can contain additional + columns, but these minimal columns must be present. + """ + def __init__(self): + pass + + # The column in the participant.tsv + PARTICIPANT_ENTITY_ID = 'entity:participant_id' + + # Columns in sample.tsv + SAMPLE_SAMPLE_ID = 'entity:sample_id' + SAMPLE_PARTICIPANT = 'participant' class BagHandler: """ - Handles data in BagIt data structure. + From a Boardwalk manifest, generates a zip file with the contents in a bagit + format, where the bagit contains two TSVs that can be uploaded to FireCloud. + + The Boardwalk manifest is a single TSV. Each row in the manifest corresponds + to a file. 
Several files can be part of the same sample, meaning a sample + can be spread across multiple rows. + + For FireCloud, the data needs to be broken up into two TSVs, a participant + and a sample TSV. + + The participant TSV is a one column TSV with the unique participant UUIDs. + + The sample TSV has one row per sample, linked to the participant TSV by + a participant column. Because a sample may contain multiple files, each + file for a sample is added as an additional column. + + Simplified example Boardwalk TSV + + DONOR UUID SAMPLE UUID FILE + d1 s1 f1 + d1 s1 f2 + d2 s2 f3 + + This gets transformed to a participant TSV, with the two unique donors: + + entity:participant_id + d1 + d2 + + And a sample TSV, with the two samples, linked to participant.tsv by the + participant column: + + entity:sample_id participant file1 file2 + s1 d1 f1 f2 + s2 d2 f3 + + In FireCloud, the name of column "entity:participant_id" in + participant.tsv, and the name of the columns "entity:sample_id" and + "participant" in sample.tsv must be exactly those. Additional columns in + sample can have any name, although the convention seems to be lower + case with underscores, so we convert the Boardwalk column names to follow + that convention. + + In this example, the file2 column is empty for the second row. That is + because the different samples can have different numbers of files. """ + def __init__(self, data, bag_name, bag_info): - # Create Pandas dataframe from tab-separated values. 
- if isinstance(data, pd.core.frame.DataFrame): - self.data = data - else: - self.data = pd.read_csv(data, sep='\t') + self.data = data self.name = bag_name self.info = bag_info @@ -33,115 +123,198 @@ def create_bag(self): data_path = bag_dir + '/data' os.makedirs(data_path) bag = bagit.make_bag(bag_dir, self.info) - self._reformat_headers() - participant, sample = self.transform() - - participant.to_csv(path=data_path + '/participant.tsv', - sep='\t', - index=False, - header=True) - sample.to_csv(path_or_buf=data_path + '/sample.tsv', - sep='\t', - index=False, - header=True) + + self.write_csv_files(data_path) + # Write BagIt to disk and create checksum manifests. bag.save(manifests=True) + # Compress bag. zipfile_tmp = tempfile.NamedTemporaryFile(suffix='.zip', delete=False) zipfile_handle = zipfile.ZipFile(zipfile_tmp, 'w', zipfile.ZIP_DEFLATED) - self.__zipdir(tempd, zipfile_handle) + self._zipdir(tempd, zipfile_handle) zipfile_handle.close() shutil.rmtree(tempd, True) return zipfile_tmp.name - def __zipdir(self, path, zip_fh): + @staticmethod + def _zipdir(path, zip_fh): # zip_fh is zipfile handle - pathLength = len(path) + path_length = len(path) for root, dirs, files in os.walk(path): for file in files: - zip_fh.write(os.path.join(root, file), arcname=root[pathLength:] + '/' + file) - - def _reformat_headers(self): - """Removes whitespace and dots in column names, and sets - all header strings to lower case.""" - df = self.data - # Remove all spaces from column headers and make lower case. - df.rename(columns=lambda x: x.replace(" ", "_"), inplace=True) - df.rename(columns=lambda x: x.replace(".", "_"), inplace=True) - df.rename(columns=lambda x: x.lower(), inplace=True) - - def transform(self): - """Transforms dataframe df for FireCloud upload and returns - two dataframes, a tuple of participant and sample, which are then - uploaded to FireCloud in that order. 
+ zip_fh.write(os.path.join(root, file), + arcname=root[path_length:] + '/' + file) + + def write_csv_files(self, data_path): """ - df = self.data - # Start normalizing the table. First, slice by file type. - df1 = df[df['file_type'] == 'crai'] - # Extract three columns from df with file type 'cram': - df2 = df[['file_type', - 'file_path', - 'upload_file_id', - 'file_urls', - 'file_dos_url']][df['file_type'] == 'cram'] - df2.rename(index=str, - columns={'file_type': 'file_type2', - 'file_path': 'file_path2', - 'upload_file_id': 'upload_file_id2', - 'file_urls': 'file_urls2', - 'file_dos_url': 'file_dos_url2'}, - inplace=True) - frames = [df1, df2] # merge both frames - for frame in frames: - frame.reset_index(drop=True, inplace=True) - # Second, by combining df1 and df2 we obtain a normalized table, - # using the index from df1. - df_new = pd.concat(frames, axis=1, join_axes=[df1.index]) - df_new.drop_duplicates(keep='first', inplace=True) - # Create a table with only one column (donor will be participant - # in FC). - participant = df_new['donor_uuid'] # extract one column - participant.name = 'entity:participant_id' # rename column header - - # Re-order index of dataframe to be compliant with FireCloud - # specifications. - new_index = ([11, 4, 3, 7, 5, 6, 8, 9, 10, 12, 13, 14] + - [0, 1, 2, 18, 19, 15, 16, 17, 20, 21, 22] + - [23, 24, 25, 26]) - L = df_new.columns.tolist() - new_col_order = [L[x] for x in new_index] - sample = df_new.reindex(columns=new_col_order) - sample = sample.rename( - index=str, - columns={'sample_uuid': 'entity:sample_id', - 'donor_uuid': 'participant_id', - 'file_type': 'file_type1', - 'file_path': 'file_path1', - 'upload_file_id': 'upload_file_id1', - 'file_urls': 'file_urls1', - 'file_dos_url': 'file_dos_url1', - 'metadata.json': 'metadata_json'}) - return participant, sample - - def __normalize(self): + Generates and writes participant.tsv and sample.tsv to data_path + directory. 
+ :param data_path: Where to write the files + :return: None """ - Normalizes dataframe to First Normal Form (1NF) such that it - contains only unique entries of donors IDs so it can be used - as primary key. Part of that is creating new columns with new - column names of those records that are duplicate. - :returns df: (Pandas dataframe) normalized + participants, samples = self.convert_to_participant_and_sample() + + with open(data_path + '/participant.tsv', 'w') as tsv: + writer = csv.DictWriter(tsv, fieldnames=[ + RequiredFirecloudColumns.PARTICIPANT_ENTITY_ID], delimiter='\t') + writer.writeheader() + for p in participants: + writer.writerow( + {RequiredFirecloudColumns.PARTICIPANT_ENTITY_ID: p}) + + with open(data_path + '/sample.tsv', 'w') as tsv: + first_row = True + for sample in samples: + if first_row: + first_row = False + keys = sample.keys() + # entity:sample_id must be first + keys.remove(RequiredFirecloudColumns.SAMPLE_SAMPLE_ID) + fieldnames = [ RequiredFirecloudColumns.SAMPLE_SAMPLE_ID]\ + + sorted(keys) + writer = csv.DictWriter(tsv, fieldnames=fieldnames, + delimiter='\t') + writer.writeheader() + writer.writerow(sample) + + def convert_to_participant_and_sample(self): + participants, max_samples, native_protocols = \ + self.participants_and_max_files_in_sample_and_protocols() + return list(participants), self.samples(max_samples, native_protocols) + + def participants_and_max_files_in_sample_and_protocols(self): + """ + Does one pass through the CSV, calculating the unique participants, + the maximum number of files for any one specimen, and the total number + of cloud native protocols being used. + :return: a tuple with a set of participants, the maximum number of + files in any one sample, and a set of the unique cloud native protocols. 
+ """ + reader = csv.DictReader(StringIO(self.data), delimiter='\t') + participants = set() + native_protocols = set() + specimens = {} # key: specimen UUID, value count + for row in reader: + # Add all participants. It's a set, so no dupes + participants.add(row[BoardwalkColumns.DONOR_UUID]) + + specimen_uuid = row[BoardwalkColumns.SPECIMEN_UUID] + if specimen_uuid in specimens: + specimens[specimen_uuid] = specimens[specimen_uuid] + 1 + else: + specimens[specimen_uuid] = 1 + + # Track all the different cloud native url protocols + for file_url in row[BoardwalkColumns.FILE_URLS].split(','): + protocol = self.native_url_protocol(file_url) + if protocol is not None: + native_protocols.add(protocol) + + return participants, max(specimens.values()), native_protocols + + def samples(self, max_files_in_sample, native_protocols): + """ + Creates a list of dicts, dict is a row in the sample TSV for FireCloud. + For all rows of the same sample in the input, create one row only, + where the file-specific data from each row is appended as additional + columns to the one row. + + The input is self.data. Requires that data be sorted by + BoardwalkColumns.SAMPLE_UUID; this routine sorts it. If data could be + sorted before being passed to this method, then we should remove + sorting in here. 
+ + :param max_files_in_sample: the maximum number of files in sample + :param native_protocols: all the unique native protocols in the data + :return: a list of dicts + """ + reader = csv.DictReader(StringIO(self.data), delimiter='\t') + samples = [] + + current_specimen_uuid = None + current_row = None + + for row in sorted(reader, key=operator.itemgetter( + BoardwalkColumns.SPECIMEN_UUID)): + specimen_uuid = row[BoardwalkColumns.SPECIMEN_UUID] + if specimen_uuid != current_specimen_uuid: + current_specimen_uuid = specimen_uuid + index = 1 + if current_row is not None: + samples.append(current_row) + current_row = self.init_sample_row(row, max_files_in_sample, + native_protocols) + else: + index = index + 1 + + self.add_files_to_row(current_row, row, str(index)) + + if current_row is not None: + samples.append(current_row) + return samples + + def add_files_to_row(self, new_row, existing_row, suffix): + """ + Takes the file-specific columns of existing_row, and adds them as + new columns to new_row. + :param new_row: + :param existing_row: + :param suffix: + :return: + """ + file_urls = existing_row[BoardwalkColumns.FILE_URLS].split(',') + for file_url in file_urls: + protocol = self.native_url_protocol(file_url) + if protocol is not None: + new_row[self.native_column_name(protocol, suffix)] = file_url + + for column in FILE_COLUMNS: + if column in existing_row: + new_row[self.firecloud_column_name(column) + suffix] = \ + existing_row[column] + + def init_sample_row(self, existing_row, max_files_in_sample, + native_protocols): """ - df = self.data - # Get list of all column names. - col_names = [col for col in df] - # Constrain that list to those column names that hold file info. 
- L = [s for s in col_names if bool(re.search('[Ff]ile', s))] - # file_type = "".join(str(s) for s in L) - - nrecords = len(df['donor_uuid'].unique()) # number of donors - filetype = df['file_type'].unique() # create list of filetypes - for idx, item in enumerate(L): - print(item,) - a = np.repeat((filetype[0]), nrecords) - return df + Create and initialize a sample row + :param existing_row: the existing row + :param max_files_in_sample: the maximum number of files in a sample + :param native_protocols: + :return: the initialized row + """ + # Rename sample column and participant + row = {RequiredFirecloudColumns.SAMPLE_SAMPLE_ID: existing_row[ + BoardwalkColumns.SAMPLE_UUID], + RequiredFirecloudColumns.SAMPLE_PARTICIPANT: existing_row[ + BoardwalkColumns.DONOR_UUID]} + + # Copy rows that don't need transformation, other than FC naming + # conventions + for key, value in existing_row.iteritems(): + if key not in COMPLEX_COLUMNS: + row[self.firecloud_column_name(key)] = value + + # Initialize columns for files and cloud native urls + for suffix in [str(i) for i in range(1, max_files_in_sample + 1)]: + for column in FILE_COLUMNS: + row[self.firecloud_column_name(column) + suffix] = None + + for native_protocol in native_protocols: + row[self.native_column_name(native_protocol, suffix)] = None + return row + + @staticmethod + def native_url_protocol(url): + index = url.find('://') + if index > 0: + return url[:index] + + @staticmethod + def native_column_name(native_protocol, suffix): + return native_protocol + '_url' + suffix + + @staticmethod + def firecloud_column_name(column): + return column.lower().replace(' ', '_').replace('.', '_') diff --git a/requirements.txt b/requirements.txt index f5e0856..87a0c6e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,7 +37,6 @@ lxml==3.6.4 Mako==1.0.6 MarkupSafe==0.23 packaging==16.8 -pandas==0.22 psycopg2==2.7.4 pyasn1==0.2.2 pycparser==2.17 @@ -57,5 +56,4 @@ SQLAlchemy==1.1.5 texttable==0.8.7 tornado==4.4.2 
tzlocal==1.3 -urllib3==1.20 Werkzeug==0.11.15 diff --git a/test/manifest_with_crai_cram_bai.tsv b/test/manifest_with_crai_cram_bai.tsv new file mode 100644 index 0000000..4e8ad37 --- /dev/null +++ b/test/manifest_with_crai_cram_bai.tsv @@ -0,0 +1,29 @@ +Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URI +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO5009365 c2ae54f0-eb16-50d6-b511-e85db0d103a5 Blood SRS1231161 71557d46-f8eb-547a-924a-d5e9f8623fa2 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD106415 adc186d7-32b6-55f3-806d-5bb85ce94c83 alignment topmed-spinnaker Alpha Build 1 cram NWD106415.b38.irc.v1.cram fff5a29f-d184-4e3b-9c5b-6f44aea7f527 b5dc2e31-8d75-5da9-b4a2-ba5061492d8d gs://topmed-irc-share/genomes/NWD106415.b38.irc.v1.cram,s3://nih-nhlbi-datacommons/NWD106415.b38.irc.v1.cram dos://dos-dss.ucsc-cgp-dev.org/fff5a29f-d184-4e3b-9c5b-6f44aea7f527?version=2018-02-28T033124.129027Z +TOPMed 1000 Genomes UW NWD259170 066c6bb5-7c8c-51ea-b0fb-0c07b105c9b7 B-lymphocyte HG01110 df2b221c-e8c1-5241-9c53-327cf103034e Normal - blood derived WGS HG01110_sample 9f1e5d7d-90f8-57c6-8ccb-ca1d89d34611 sequence_upload spinnaker 1.1.2 crai NWD259170.recab.cram.crai 46c8a5f1-15ab-48fa-8d1c-63099422e3c7 0d6371a8-fc4f-5232-9660-e655903b17ea s3://commons-dss-commons/blobs/1c7d249c9123007d693857eab3dd4646bc8d742e76c6716c80debbbdd5d48e8b.be947abb597d1a21f2da9d97d96f58e7ca07a214.b933d8fb97268c951e610b6dfa20924d.6fbcd0b3 dos://dos-dss.ucsc-cgp-dev.org/46c8a5f1-15ab-48fa-8d1c-63099422e3c7?version=2018-01-31T081722.854147Z +TOPMed 1000 Genomes WashU NWD100953 4ff60ff5-a1d6-557b-b4ed-3220f62a1b02 B-lymphocyte HG01110 37865db0-0a24-5b4f-937f-00871886906b Normal - 
blood derived WGS HG01110_sample c4ffc283-b53e-5559-9d9f-3238ac281d86 sequence_upload spinnaker 1.1.3 crai NWD100953.recab.cram.crai a62ee491-489d-405a-8a3b-83765f9e91fb 44a8837b-4456-5709-b56b-54e23000f13a s3://commons-dss-commons/blobs/93e04a07d92ea6732440484ba3b3d2a4841f34c92bd2e41c724d2002e94b44b5.09fcf961a8e3abba39e9a237a104c50668329356.3b0f63a815384a3d44c61b4abd40caf9.a75a2c4b dos://dos-dss.ucsc-cgp-dev.org/a62ee491-489d-405a-8a3b-83765f9e91fb?version=2018-01-31T152803.900629Z +TOPMed HapMap Baylor NWD875673 149fd7b7-1c11-593b-9625-c20f279f68ff B-lymphocyte NA12878 0e448c1f-81ac-5054-8069-5868469d5308 Normal - solid tissue WGS NA12878_sample 7a2e07bc-41b9-5c19-869f-9dcd92892bee sequence_upload spinnaker 1.1.2 cram NWD875673.recab.cram 94694564-bdd3-43dd-9af7-d9c055fd0773 2277b3fc-5a75-5782-86a0-c29f13844e7d s3://commons-dss-commons/blobs/170f6ea09964e5f0442ac9c2ae997659ddf3bb18ef18cb8936d8db7672efb59f.77c0ce859e1a89e5eba95eeca790257cd033eb3e.15ba273d28a75721f412012f33d9c45e-343.455e8975 dos://dos-dss.ucsc-cgp-dev.org/94694564-bdd3-43dd-9af7-d9c055fd0773?version=2018-01-31T092944.494586Z +TOPMed HapMap Baylor NWD875673 149fd7b7-1c11-593b-9625-c20f279f68ff B-lymphocyte NA12878 0e448c1f-81ac-5054-8069-5868469d5308 Normal - solid tissue WGS NA12878_sample 7a2e07bc-41b9-5c19-869f-9dcd92892bee sequence_upload spinnaker 1.1.2 crai NWD875673.recab.cram.crai b8906dd1-2117-4679-8ac7-4ea6cdc045f1 2277b3fc-5a75-5782-86a0-c29f13844e7d s3://commons-dss-commons/blobs/df80c4043ce92ba3fb36ff6896f6830585e572358efd00e17d460ff6705824b5.3b7a821337628c2e90d8736a7b8b89f0229168d1.162e9d9e87bce7bae5d15438721fb8a0.830013b8 dos://dos-dss.ucsc-cgp-dev.org/b8906dd1-2117-4679-8ac7-4ea6cdc045f1?version=2018-01-31T093033.045825Z +TOPMed HapMap Broad NWD768309 7d678b22-314d-54c9-8dd3-7fdfdd6bec89 B-lymphocyte NA12878 af5eceac-72b6-5e90-a822-ee2e8d12ec55 Normal - blood WGS NA12878_sample 250da12d-c7f1-5a2c-8573-1c1c683d6d33 sequence_upload spinnaker 1.1.2 cram NWD768309.recab.cram 
6b1b029e-789b-47cd-9aa4-7f7c38e612fd 1ecf1c35-9e1e-55ef-8f42-71102c3abc33 s3://commons-dss-commons/blobs/e596a82a48f753c030ba1026aa8752ad149cae0d6a8c0bc46de64463ef7ef8db.975b8117f41f20157bc5418f44a2fe414605ca4e.170d40e2e02088f34943a39964ebef2f-308.9d1ea3b9 dos://dos-dss.ucsc-cgp-dev.org/6b1b029e-789b-47cd-9aa4-7f7c38e612fd?version=2018-01-31T142525.164592Z +TOPMed HapMap Broad NWD768309 7d678b22-314d-54c9-8dd3-7fdfdd6bec89 B-lymphocyte NA12878 af5eceac-72b6-5e90-a822-ee2e8d12ec55 Normal - blood WGS NA12878_sample 250da12d-c7f1-5a2c-8573-1c1c683d6d33 sequence_upload spinnaker 1.1.2 crai NWD768309.recab.cram.crai e1f2f1ec-eff6-42f6-92ad-51cea0c165f8 1ecf1c35-9e1e-55ef-8f42-71102c3abc33 s3://commons-dss-commons/blobs/68e41018e32ed6a8cb835d41a3d30c5cb130e05d20c377a14d9884bd2ac423d6.e9649dcc7f6174ec519988421cd4af4c63bbf5a6.3918dcc5e0bb2c93fefe3daba4f8eeef.9873b877 dos://dos-dss.ucsc-cgp-dev.org/e1f2f1ec-eff6-42f6-92ad-51cea0c165f8?version=2018-01-31T142525.877824Z +TOPMed HapMap NYGC NWD119836 bc6e1fd7-229d-5e65-a5a2-a15fee0613c0 B-lymphocyte NA12878 8628f32f-d6fd-5419-a364-242a11abebb5 Normal - solid tissue WGS NA12878_sample 8ca820a2-d182-580c-9572-636f9f0eae62 sequence_upload spinnaker 1.1.2 crai NWD119836.recab.cram.crai 693dc20d-a6bf-4334-857c-6a496803b34a 204cf1bd-1477-57e6-880c-1b863edac627 s3://commons-dss-commons/blobs/597d139565b45176509fc7a4f3fc7066cac626ce50d80ad2a6b643eb0d9b4a5a.dda8235ca55396f9ab5cd2fb9e61d43dd796e7da.0ea01635527c738a5f3bf82acf0c3859.dcc419cd dos://dos-dss.ucsc-cgp-dev.org/693dc20d-a6bf-4334-857c-6a496803b34a?version=2018-01-31T142528.027169Z +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO1035541 dae1e053-1a58-5cea-a168-cbfc7cebe679 Blood SRS1231232 bfcc3266-340a-5751-8db1-d661163ac8e5 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD145710 a3e869a6-2cc1-5fc5-aa4a-c6e9d071ff38 alignment topmed-spinnaker Alpha Build 1 cram NWD145710.b38.irc.v1.cram ab4c0815-a366-47b8-b94f-626458d43859 
d8fe0ae3-efa2-59c3-9e70-1b164ca868b3 gs://topmed-irc-share/genomes/NWD145710.b38.irc.v1.cram,s3://nih-nhlbi-datacommons/NWD145710.b38.irc.v1.cram dos://dos-dss.ucsc-cgp-dev.org/ab4c0815-a366-47b8-b94f-626458d43859?version=2018-02-28T051204.497736Z +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO1035541 dae1e053-1a58-5cea-a168-cbfc7cebe679 Blood SRS1231232 bfcc3266-340a-5751-8db1-d661163ac8e5 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD145710 a3e869a6-2cc1-5fc5-aa4a-c6e9d071ff38 alignment topmed-spinnaker Alpha Build 1 crai NWD145710.b38.irc.v1.cram.crai f0b142de-e19a-4771-bd6c-d7c5a11d2a43 d8fe0ae3-efa2-59c3-9e70-1b164ca868b3 s3://nih-nhlbi-datacommons/NWD145710.b38.irc.v1.cram.crai,gs://topmed-irc-share/genomes/NWD145710.b38.irc.v1.cram.crai dos://dos-dss.ucsc-cgp-dev.org/f0b142de-e19a-4771-bd6c-d7c5a11d2a43?version=2018-02-28T051206.328525Z +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO5009365 c2ae54f0-eb16-50d6-b511-e85db0d103a5 Blood SRS1231161 71557d46-f8eb-547a-924a-d5e9f8623fa2 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD106415 adc186d7-32b6-55f3-806d-5bb85ce94c83 alignment topmed-spinnaker Alpha Build 1 crai NWD106415.b38.irc.v1.cram.crai 5eec1df5-408d-413b-9d46-87a587e2b8fc b5dc2e31-8d75-5da9-b4a2-ba5061492d8d gs://topmed-irc-share/genomes/NWD106415.b38.irc.v1.cram.crai,s3://nih-nhlbi-datacommons/NWD106415.b38.irc.v1.cram.crai dos://dos-dss.ucsc-cgp-dev.org/5eec1df5-408d-413b-9d46-87a587e2b8fc?version=2018-02-28T033125.424703Z +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO8055779 d58e246b-5cc1-5d5d-ac8c-cdeedb54d81b Blood SRS1231088 47a167c5-08b9-507f-9b6e-5c252ea89683 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD321156 b669582f-2ab4-58c5-8ac7-ede4a5149c07 alignment topmed-spinnaker Alpha Build 1 cram NWD321156.b38.irc.v1.cram 34171382-93f5-42e4-85d8-ebd8efdf2440 2f0a2f5f-5a47-5728-9375-51ecc7fb665d 
gs://topmed-irc-share/genomes/NWD321156.b38.irc.v1.cram,s3://nih-nhlbi-datacommons/NWD321156.b38.irc.v1.cram dos://dos-dss.ucsc-cgp-dev.org/34171382-93f5-42e4-85d8-ebd8efdf2440?version=2018-02-28T051322.937836Z +TOPMed HapMap NYGC NWD293295 b8284a5b-429d-5652-8247-0257f1e2f61d B-lymphocyte NA19238 58036d43-ec02-59f1-a6d8-83e7f666d90a Normal - solid tissue WGS NA19238_sample 7c94077f-ea7c-5e9a-99fe-d13cac77a61d sequence_upload spinnaker 1.1.2 cram NWD293295.recab.cram 87174c6a-ec98-44a4-9200-4292daa5b185 06c4bd47-c8e2-5045-8bae-bfad24633c87 s3://commons-dss-commons/blobs/19e5620579898ace0db2135e0434daba4f48edf72c5dbf82bfc1ad173161ff71.e1137f4a813e4d18387a799a5f88bb5d300c2cd6.aa81284302982b0f755d1238c3349762-336.feb986d1 dos://dos-dss.ucsc-cgp-dev.org/87174c6a-ec98-44a4-9200-4292daa5b185?version=2018-01-31T081712.534262Z +TOPMed HapMap UW NWD578417 a89a200d-e6f4-513a-b634-b1f7e10cf57a B-lymphocyte NA12878 dbe15848-e076-5517-a1c4-9d2cdbd6b4b5 Normal - blood derived WGS NA12878_sample bb6d255e-f3fa-5bb5-8333-872ff60e491e sequence_upload spinnaker 1.1.2 crai NWD578417.recab.cram.crai 8dd791a7-c6c9-4874-b110-b7b82656afe1 139f30ba-62d3-50fb-9177-ab3d370e29f8 s3://commons-dss-commons/blobs/7253c293c7112d801a7d1278ba6518ef283b2a8c3177a9b7c149f441892ec2c7.84393859a40b0277c70c8f32c2e833d66904beec.327ee0c5c1f606ce22a753d82b76285e.6539dadd dos://dos-dss.ucsc-cgp-dev.org/8dd791a7-c6c9-4874-b110-b7b82656afe1?version=2018-01-31T142517.088013Z +TOPMed HapMap NYGC NWD119836 bc6e1fd7-229d-5e65-a5a2-a15fee0613c0 B-lymphocyte NA12878 8628f32f-d6fd-5419-a364-242a11abebb5 Normal - solid tissue WGS NA12878_sample 8ca820a2-d182-580c-9572-636f9f0eae62 sequence_upload spinnaker 1.1.2 cram NWD119836.recab.cram c3701573-8cc4-4da8-81ae-d3cb5dd67083 204cf1bd-1477-57e6-880c-1b863edac627 s3://commons-dss-commons/blobs/16b8bb72660b1612bfc14c6967124daf36695fa2c48a85c027b1ae56b557f4b1.4c40c1c34375fec97bea30343436e9d39c64c696.7b21891b5cfa4e8982fd55816d966191-308.2837f9a7 
dos://dos-dss.ucsc-cgp-dev.org/c3701573-8cc4-4da8-81ae-d3cb5dd67083?version=2018-01-31T092110.375884Z +NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 crai NWD692354.b38.irc.v1.cram.crai 5a00cc38-2f8d-4d34-98e0-0a847579b988 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram.crai,s3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram.crai dos://dos-dss.ucsc-cgp-dev.org/5a00cc38-2f8d-4d34-98e0-0a847579b988?version=2018-02-28T160411.061319Z +NHLBI TOPMed: Boston Early-Onset COPD Study in the TOPMed Program COPD UW EO8055779 d58e246b-5cc1-5d5d-ac8c-cdeedb54d81b Blood SRS1231088 47a167c5-08b9-507f-9b6e-5c252ea89683 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD321156 b669582f-2ab4-58c5-8ac7-ede4a5149c07 alignment topmed-spinnaker Alpha Build 1 crai NWD321156.b38.irc.v1.cram.crai 1c9db311-4015-4ae2-9cc2-99a5f98d96ba 2f0a2f5f-5a47-5728-9375-51ecc7fb665d gs://topmed-irc-share/genomes/NWD321156.b38.irc.v1.cram.crai,s3://nih-nhlbi-datacommons/NWD321156.b38.irc.v1.cram.crai dos://dos-dss.ucsc-cgp-dev.org/1c9db311-4015-4ae2-9cc2-99a5f98d96ba?version=2018-02-28T051324.619728Z +TOPMed 1000 Genomes UW NWD259170 066c6bb5-7c8c-51ea-b0fb-0c07b105c9b7 B-lymphocyte HG01110 df2b221c-e8c1-5241-9c53-327cf103034e Normal - blood derived WGS HG01110_sample 9f1e5d7d-90f8-57c6-8ccb-ca1d89d34611 sequence_upload spinnaker 1.1.2 cram NWD259170.recab.cram 670b2f03-8444-4e14-a13b-f804b56a4c4c 0d6371a8-fc4f-5232-9660-e655903b17ea s3://commons-dss-commons/blobs/8342b0cee5cc14d4dcc33bbc7b29d0bd79a18f8d4c0a29d9d365596485d42649.036a40effc41791699de5ec159b3e84505bdef5a.e9ad2e5c1532f49064d3c17e16a28dd0-262.afc07a01 
dos://dos-dss.ucsc-cgp-dev.org/670b2f03-8444-4e14-a13b-f804b56a4c4c?version=2018-01-31T081722.372972Z +TOPMed HapMap NYGC NWD293295 b8284a5b-429d-5652-8247-0257f1e2f61d B-lymphocyte NA19238 58036d43-ec02-59f1-a6d8-83e7f666d90a Normal - solid tissue WGS NA19238_sample 7c94077f-ea7c-5e9a-99fe-d13cac77a61d sequence_upload spinnaker 1.1.2 crai NWD293295.recab.cram.crai 10c8bed3-4395-490a-9f75-0cb5cc991c7a 06c4bd47-c8e2-5045-8bae-bfad24633c87 s3://commons-dss-commons/blobs/94e5eb2718e4bd776f2527f91794ef8511434ee6eec917c3f0b8177986896296.8b629733a8fd135a9dacd6264801184808070131.3a225f58729ac12d2d8e9142636ba6c9.eb66a4cf dos://dos-dss.ucsc-cgp-dev.org/10c8bed3-4395-490a-9f75-0cb5cc991c7a?version=2018-01-31T081713.079689Z +TOPMed 1000 Genomes WashU NWD100953 4ff60ff5-a1d6-557b-b4ed-3220f62a1b02 B-lymphocyte HG01110 37865db0-0a24-5b4f-937f-00871886906b Normal - blood derived WGS HG01110_sample c4ffc283-b53e-5559-9d9f-3238ac281d86 sequence_upload spinnaker 1.1.3 cram NWD100953.recab.cram cdf3ec65-0c1a-4107-8200-8e8d5d2b5b14 44a8837b-4456-5709-b56b-54e23000f13a s3://commons-dss-commons/blobs/3e676baceae3698ae35d63a56a471deef12e2aff5581e93b1716d6cc9f0d0c4d.4a704dd7cb65a4465dc0f687a70783731603ef87.ac94763859e218da0be7e3ea25786bb1-559.ff76a67c dos://dos-dss.ucsc-cgp-dev.org/cdf3ec65-0c1a-4107-8200-8e8d5d2b5b14?version=2018-01-31T152458.523248Z +TOPMed HapMap NYGC NWD119836 bc6e1fd7-229d-5e65-a5a2-a15fee0613c0 B-lymphocyte NA12878 8628f32f-d6fd-5419-a364-242a11abebb5 Normal - solid tissue WGS NA12878_sample 8ca820a2-d182-580c-9572-636f9f0eae62 sequence_upload spinnaker 1.1.2 cram NWD119836.recab.cram d5cd7954-3a9a-4877-830a-b0dc463d3373 204cf1bd-1477-57e6-880c-1b863edac627 s3://commons-dss-commons/blobs/16b8bb72660b1612bfc14c6967124daf36695fa2c48a85c027b1ae56b557f4b1.4c40c1c34375fec97bea30343436e9d39c64c696.7b21891b5cfa4e8982fd55816d966191-308.2837f9a7 dos://dos-dss.ucsc-cgp-dev.org/d5cd7954-3a9a-4877-830a-b0dc463d3373?version=2018-01-31T142527.540197Z +TOPMed HapMap Baylor 
NWD119844 ade6f774-2b64-5caa-a61f-593ab316cb66 B-lymphocyte NA12878 0c3a10a8-1b44-5b86-80e7-a242473b5470 Normal - solid tissue WGS NA12878_sample 0ff51e1f-2209-58af-9e87-7e57cf41df35 sequence_upload spinnaker 1.1.2 cram NWD119844.recab.cram 05c8f031-958b-4671-8e77-218a0b18d26d 06dfc2ab-2d04-52c3-9723-0ac4042e4e38 s3://commons-dss-commons/blobs/bb1e5b25eb820186fc35e08acb815add88e4581cac6ff8a28a432f08fa507b74.25b77b75ac1109e0027f68bbbb5505e5b50921dc.d0f4fa6ef638c6dc7eca6c077c48ba76-348.f03272c9 dos://dos-dss.ucsc-cgp-dev.org/05c8f031-958b-4671-8e77-218a0b18d26d?version=2018-01-31T142456.422327Z +TOPMed HapMap Baylor NWD119844 ade6f774-2b64-5caa-a61f-593ab316cb66 B-lymphocyte NA12878 0c3a10a8-1b44-5b86-80e7-a242473b5470 Normal - solid tissue WGS NA12878_sample 0ff51e1f-2209-58af-9e87-7e57cf41df35 sequence_upload spinnaker 1.1.2 crai NWD119844.recab.cram.crai 6079aaa4-71a1-41e1-9588-fded71219764 06dfc2ab-2d04-52c3-9723-0ac4042e4e38 s3://commons-dss-commons/blobs/a2c7634e5a9203219cc227fbb1583216ef934fb153e5711e1db117379f1ee8f8.bb6f1d0d5a033215e2f8ecdf78dbedf8958e081a.48c26ed3d4a8224f4dfa3515edae749b.d47f8df4 dos://dos-dss.ucsc-cgp-dev.org/6079aaa4-71a1-41e1-9588-fded71219764?version=2018-01-31T142457.109961Z +NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 cram NWD692354.b38.irc.v1.cram b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d s3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram,gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram dos://dos-dss.ucsc-cgp-dev.org/b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5?version=2018-02-28T160408.957538Z +TOPMed HapMap Broad NWD831422 bb2afc83-8980-53e3-a844-4c273b68bba4 B-lymphocyte NA12878 
58ca49bc-fa95-5bf9-b547-552e2645a4cd Normal - blood WGS NA12878_sample 9d09c7c4-a078-5105-9493-26b0b4e40b79 sequence_upload spinnaker 1.1.2 cram NWD831422.recab.cram 91392f82-7529-4a5a-9ee6-1c33f5637332 0e727062-7fc9-5e46-b1e3-24537426ca4c s3://commons-dss-commons/blobs/1b57d910d63451195e04dffefea05d9c358ed0ecdf2c7dec089f1adb7b87242a.6a30d9b442338922707a0da64a22538f1a6a39aa.75bab7b53cf134670dbf42a3fdc4ea64-343.1187a558 dos://dos-dss.ucsc-cgp-dev.org/91392f82-7529-4a5a-9ee6-1c33f5637332?version=2018-01-31T082344.675433Z +TOPMed HapMap Broad NWD831422 bb2afc83-8980-53e3-a844-4c273b68bba4 B-lymphocyte NA12878 58ca49bc-fa95-5bf9-b547-552e2645a4cd Normal - blood WGS NA12878_sample 9d09c7c4-a078-5105-9493-26b0b4e40b79 sequence_upload spinnaker 1.1.2 crai NWD831422.recab.cram.crai db31d438-d369-48cc-aaa0-abc953c21213 0e727062-7fc9-5e46-b1e3-24537426ca4c s3://commons-dss-commons/blobs/30da18b02902ae9e5bb51768728064886e47f57573687b282e75d874cdc79ebf.f248ba5422244cf67632ef93420591b5d57d9e38.dcbe9d985c104fd240b9c307e4acf4c0.542f74e7 dos://dos-dss.ucsc-cgp-dev.org/db31d438-d369-48cc-aaa0-abc953c21213?version=2018-01-31T082415.483029Z +TOPMed HapMap UW NWD578417 a89a200d-e6f4-513a-b634-b1f7e10cf57a B-lymphocyte NA12878 dbe15848-e076-5517-a1c4-9d2cdbd6b4b5 Normal - blood derived WGS NA12878_sample bb6d255e-f3fa-5bb5-8333-872ff60e491e sequence_upload spinnaker 1.1.2 cram NWD578417.recab.cram 8a3fb043-0324-465c-b000-60c150dd68b1 139f30ba-62d3-50fb-9177-ab3d370e29f8 s3://commons-dss-commons/blobs/fb94baec68b9b6b9e10d0f61aa03e915314d24320533f72088a60c412e247c77.b07c72e12d6857b3093cddc871a7b0cadc1831fe.5d06e7da42fa89d8f447cbbdeb337e03-300.d55d6c31 dos://dos-dss.ucsc-cgp-dev.org/8a3fb043-0324-465c-b000-60c150dd68b1?version=2018-01-31T142516.543571Z +TOPMed HapMap NYGC NWD119836 bc6e1fd7-229d-5e65-a5a2-a15fee0613c0 B-lymphocyte NA12878 8628f32f-d6fd-5419-a364-242a11abebb5 Normal - solid tissue WGS NA12878_sample 8ca820a2-d182-580c-9572-636f9f0eae62 sequence_upload spinnaker 1.1.2 crai 
NWD119836.recab.cram.crai 975b0bf5-96f4-4e34-aa93-681e1d558b9c 204cf1bd-1477-57e6-880c-1b863edac627 s3://commons-dss-commons/blobs/597d139565b45176509fc7a4f3fc7066cac626ce50d80ad2a6b643eb0d9b4a5a.dda8235ca55396f9ab5cd2fb9e61d43dd796e7da.0ea01635527c738a5f3bf82acf0c3859.dcc419cd dos://dos-dss.ucsc-cgp-dev.org/975b0bf5-96f4-4e34-aa93-681e1d558b9c?version=2018-01-31T092315.854852Z diff --git a/test/test_bagitutils.py b/test/test_bagitutils.py index 73faf8e..0613587 100644 --- a/test/test_bagitutils.py +++ b/test/test_bagitutils.py @@ -2,56 +2,16 @@ import unittest import os -import pandas as pd -from StringIO import StringIO import zipfile -# import numpy as np from bagitutils import BagHandler -from pandas.util.testing import assert_frame_equal class TestBagHandlerMethods(unittest.TestCase): - def setUp(self): - """Load normalized test data into Pandas dataframe""" - fpath = 'test/test_normalize_df_mock.tsv' - try: - df = pd.read_csv( - fpath, - sep='\t') - except IOError: - print('Cannot open file') - self.normalized = df - - def test_normalize(self): - """ """ - # fpath = 'test/test_normalize_df_mock.tsv' - # df = pd.read_csv(fpath, sep='\t') - # args = dict([('data', df), - # ('bag_info', 'test'), - # ('bag_path', '~/dev/manifest-handover')]) - # bag = BagHandler(**args) - # df_test = bag._BagHandler__normalize() - # assert_frame_equal(self.normalized, df) - - def test_worksWithString(self): - s = StringIO("Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URL\n\ - NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood 
Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 crai NWD692354.b38.irc.v1.cram.crai 5a00cc38-2f8d-4d34-98e0-0a847579b988 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram.crai', u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram.crai'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/5a00cc38-2f8d-4d34-98e0-0a847579b988?version=2018-02-28T160411.061319Z\n\ - NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 cram NWD692354.b38.irc.v1.cram b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram', u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5?version=2018-02-28T160408.957538Z\n") - pd.read_csv(s, sep='\t') - - def test_bagHandler(self): - s = StringIO("Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URL\n\ - NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 crai NWD692354.b38.irc.v1.cram.crai 
5a00cc38-2f8d-4d34-98e0-0a847579b988 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram.crai', u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram.crai'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/5a00cc38-2f8d-4d34-98e0-0a847579b988?version=2018-02-28T160411.061319Z\n\ - NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 cram NWD692354.b38.irc.v1.cram b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram', u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5?version=2018-02-28T160408.957538Z\n") - bag = BagHandler(data=s, bag_info={}, bag_name='manifest') - zip_name = bag.create_bag() - os.remove(zip_name) - def test_zipRootIsManifest(self): - s = StringIO("Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URL\n\ + s = "Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URL\n\ NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham 
Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 crai NWD692354.b38.irc.v1.cram.crai 5a00cc38-2f8d-4d34-98e0-0a847579b988 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram.crai', u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram.crai'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/5a00cc38-2f8d-4d34-98e0-0a847579b988?version=2018-02-28T160411.061319Z\n\ - NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 cram NWD692354.b38.irc.v1.cram b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram', u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram'] dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5?version=2018-02-28T160408.957538Z\n") + NHLBI TOPMed: Whole Genome Sequencing and Related Phenotypes in the Framingham Heart Study Framingham Broad 20428 09df7aef-246a-57eb-9685-e1d4d18b55ab BLOOD SRS1353998 2923638f-0784-5704-8d93-5b97b4ca3092 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome NWD692354 dd8337dd-f731-5c3b-9a03-bdae77ca47a9 alignment topmed-spinnaker Alpha Build 1 cram NWD692354.b38.irc.v1.cram b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5 50cfaf90-0998-5ef5-aa0b-cfaea71d5a7d [u's3://nih-nhlbi-datacommons/NWD692354.b38.irc.v1.cram', u'gs://topmed-irc-share/genomes/NWD692354.b38.irc.v1.cram'] 
dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/b4cf8998-34a1-4e00-aa23-bcdf8d6b23b5?version=2018-02-28T160408.957538Z\n" bag = BagHandler(data=s, bag_info={}, bag_name='manifest') zip_name = bag.create_bag() with zipfile.ZipFile(zip_name) as myzip: @@ -61,6 +21,62 @@ def test_zipRootIsManifest(self): self.assertIn('data/', name) os.remove(zip_name) + def testDemoData(self): + data = ("Program Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URI\n\ +NIH Data Commons NIH Data Commons Pilot Broad Public Datasets ABC123456 c2b4c298-4d80-4aaa-bddf-20c15d184af3 Blood NA12878_2 bfcc3266-340a-5751-8db1-d661163ac8e5 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome H06JUADXX130110_1 c774934f-4100-44bf-8df9-8d4e509c088d none test workflow Development bam H06JUADXX130110.1.ATCACGAT.20k_reads.bam 60936d97-6358-4ce3-8136-d5776186ee21 dd04fbf3-2a51-4c72-8038-da7094b8da55 gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/60936d97-6358-4ce3-8136-d5776186ee21?version=2018-03-23T123738.145535Z") + bag = BagHandler(data=data, bag_info={}, bag_name='manifest') + (participants, sample) = bag.convert_to_participant_and_sample() + self.assertListEqual( + sorted(['c2b4c298-4d80-4aaa-bddf-20c15d184af3']), + sorted(participants)) + self.assertEquals(len(sample), 1) + row = sample[0] + self.assertEquals(row['participant'], 'c2b4c298-4d80-4aaa-bddf-20c15d184af3') + self.assertEquals(row['gs_url1'], 'gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam') + self.assertFalse('s3_url1' in row) + + def testWriteCsv(self): + data = ("Program 
Project Center Name Submitter Donor ID Donor UUID Submitter Donor Primary Site Submitter Specimen ID Specimen UUID Submitter Specimen Type Submitter Experimental Design Submitter Sample ID Sample UUID Analysis Type Workflow Name Workflow Version File Type File Path Upload File ID Data Bundle UUID Metadata.json File URLs File DOS URI\n\ + NIH Data Commons NIH Data Commons Pilot Broad Public Datasets ABC123456 c2b4c298-4d80-4aaa-bddf-20c15d184af3 Blood NA12878_2 bfcc3266-340a-5751-8db1-d661163ac8e5 Normal - Blood Seq_DNA_SNP_CNV; Seq_DNA_WholeGenome H06JUADXX130110_1 c774934f-4100-44bf-8df9-8d4e509c088d none test workflow Development bam H06JUADXX130110.1.ATCACGAT.20k_reads.bam 60936d97-6358-4ce3-8136-d5776186ee21 dd04fbf3-2a51-4c72-8038-da7094b8da55 gs://broad-public-datasets/NA12878_downsampled_for_testing/unmapped/H06JUADXX130110.1.ATCACGAT.20k_reads.bam dos://dos-dss.ucsc-cgp-dev.org/ga4gh/dos/v1/dataobjects/60936d97-6358-4ce3-8136-d5776186ee21?version=2018-03-23T123738.145535Z") + bag = BagHandler(data=data, bag_info={}, bag_name='manifest') + zip_name = bag.create_bag() + with zipfile.ZipFile(zip_name) as myzip: + for name in myzip.namelist(): + if 'sample' in name: + sample = myzip.open(name) + row = sample.read() + sampleid = 'entity:sample_id' + self.assertEqual(sampleid, row[:len(sampleid)]) + + os.remove(zip_name) + + def test_process_demo_data(self): + with open('test/manifest_with_crai_cram_bai.tsv', 'r') as tsv: + lines = tsv.readlines() + data = "\n".join(lines) + bag = BagHandler(data=data, bag_info={}, bag_name='manifest') + participants, max_files_in_sample, protocols = bag.participants_and_max_files_in_sample_and_protocols() + self.assertEqual(len(participants), 13) + self.assertEqual(len(protocols), 2) + self.assertEqual(max_files_in_sample, 4) + samples = bag.samples(max_files_in_sample, protocols) + self.assertEqual(len(samples), 13) + + # Ensure every row has file_dos_uri column + for suffix in [str(i) for i in range(1, max_files_in_sample + 
1)]: + for i in range(0, len(samples)): + self.assertIn('file_dos_uri' + suffix, samples[i].keys()) + + first_row_keys = sorted(samples[0]) + for i in range(0, len(samples)): + # Ensure all rows have the same keys + self.assertListEqual(first_row_keys, sorted(samples[i].keys())) + # Ensure there is no column with a 0 (zero) in its name (there was in + # the past before this test was written -- make sure it doesn't creep + # back in). + for key in first_row_keys: + self.assertNotIn('0', key) + if __name__ == '__main__': unittest.main() diff --git a/webservice.py b/webservice.py index cfbfa4a..99ed805 100644 --- a/webservice.py +++ b/webservice.py @@ -391,7 +391,7 @@ def export_to_firecloud(): 'data_type': 'TOPMed', 'date_created': datetime.datetime.now().isoformat()} # Instantiate bag object. - bag = BagHandler(data=StringIO(response_obj.get_data()), + bag = BagHandler(data=response_obj.get_data(), bag_info=bag_info, bag_name=bag_name) # Pathname of compressed bag. @@ -402,11 +402,9 @@ def export_to_firecloud(): fc_lambda_protocol = os.getenv("FC_LAMBDA_PROTOCOL", "https") fc_lambda_domain = os.getenv("FC_LAMBDA_DOMAIN", domain) fc_lambda_port = os.getenv("FC_LAMBDA_PORT", '443') - url = (fc_lambda_protocol + - '://' + fc_lambda_domain + - ':' + fc_lambda_port + - '/api/exportBag?workspace=' + workspace + - '&namespace=' + namespace) + url = '{}://{}:{}/api/exportBag?workspace={}&namespace={}'.format( + fc_lambda_protocol, fc_lambda_domain, fc_lambda_port, + workspace, namespace) logger.info("going to hit {}".format(url)) headers = {'Content-Type': 'application/octet-stream', 'Accept': 'application/json',