From d94b41284aa90a3a75c1e64b33c8476d4a3e4572 Mon Sep 17 00:00:00 2001
From: Steen Hoyer
Date: Wed, 8 Feb 2017 18:24:21 -0600
Subject: [PATCH 1/3] Indent line on dumping--include it in 'hint' block

---
 docs/creation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/creation.rst b/docs/creation.rst
index b88a80b..90c0142 100644
--- a/docs/creation.rst
+++ b/docs/creation.rst
@@ -75,7 +75,7 @@ one Study section structured as defined by the `ISA-Tab Specification `_.
 
 .. hint:: Remember that when you ``dump()`` ISA content, you do it on the ``Investigation`` object. This means any
-``Study`` and ``Assay`` objects and content must be attached to the ``Investigation`` for it to be serialized out.
+   ``Study`` and ``Assay`` objects and content must be attached to the ``Investigation`` for it to be serialized out.
 
 Different classes in ``isatools.model.v1`` have class constructors and instance variables that roughly map to the ISA
 Abstract Model. For full details of how to instantiate model classes, access and manipulate ISA data as objects,

From 1f576702c7771e587d318f7454d9d6081421fe71 Mon Sep 17 00:00:00 2001
From: David Johnson
Date: Mon, 13 Feb 2017 16:02:09 +0000
Subject: [PATCH 2/3] Tweaks to process key generator

---
 isatools/isatab.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/isatools/isatab.py b/isatools/isatab.py
index 892790a..4bc626d 100644
--- a/isatools/isatab.py
+++ b/isatools/isatab.py
@@ -3153,43 +3153,37 @@ def get_contacts(section_df):
     return investigation
 
-def process_keygen(protocol_ref, column_group, object_label_index, all_columns, series, series_index):
-
+def process_keygen(protocol_ref, column_group, object_label_index, all_columns, series, series_index, DF):
     process_key = protocol_ref
-
-    node_key = None
-
     node_cols = [i for i, c in enumerate(all_columns) if c in _LABELS_MATERIAL_NODES + _LABELS_DATA_NODES]
-
+    input_node_value = ''
+    output_node_value = ''
     output_node_index = find_gt(node_cols, object_label_index)
-
     if output_node_index > -1:
-
         output_node_label = all_columns[output_node_index]
         output_node_value = series[output_node_label]
-        node_key = output_node_value
-
     input_node_index = find_lt(node_cols, object_label_index)
-
     if input_node_index > -1:
-
         input_node_label = all_columns[input_node_index]
         input_node_value = series[input_node_label]
 
+    input_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[input_node_index]]].drop_duplicates()
+    output_nodes_with_prot_keys = DF[[all_columns[object_label_index], all_columns[output_node_index]]].drop_duplicates()
+
+    if len(input_nodes_with_prot_keys) > len(output_nodes_with_prot_keys):
+        node_key = output_node_value
+    else:
         node_key = input_node_value
 
     if process_key == protocol_ref:
-
         process_key += '-' + str(series_index)
 
     name_column_hits = [n for n in column_group if n in _LABELS_ASSAY_NODES]
-
     if len(name_column_hits) == 1:
         process_key = series[name_column_hits[0]]
     else:
         pv_cols = [c for c in column_group if c.startswith('Parameter Value[')]
-
         if len(pv_cols) > 0:
             # 2.
else try use protocol REF + Parameter Values as key if node_key is not None: @@ -3624,7 +3618,7 @@ def get_node_by_label_and_key(l, k): protocol_ref = object_series[column_group[0]] - process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _) + process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF) try: process = processes[process_key] @@ -3705,7 +3699,7 @@ def get_node_by_label_and_key(l, k): protocol_ref = object_series[column_group[0]] - process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _) + process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF) process_key_sequence.append(process_key) From aa93d2bc818e1f559c9036ebe40359a34180b729 Mon Sep 17 00:00:00 2001 From: David Johnson Date: Wed, 15 Feb 2017 10:10:56 +0000 Subject: [PATCH 3/3] Various work towards #184 and #185 --- isatools/io/isatab_parser.py | 80 +-- isatools/isajson.py | 20 +- isatools/isatab.py | 909 +++++++++++++++-------------------- isatools/model/v1.py | 14 +- isatools/utils.py | 12 +- tests/test_isatab.py | 5 +- tests/test_isatab2json.py | 16 +- tests/test_isatools_utils.py | 2 +- 8 files changed, 440 insertions(+), 618 deletions(-) diff --git a/isatools/io/isatab_parser.py b/isatools/io/isatab_parser.py index 5f12e67..558b4e7 100644 --- a/isatools/io/isatab_parser.py +++ b/isatools/io/isatab_parser.py @@ -227,25 +227,20 @@ def __init__(self, base_file): "Comment", "Label", "Material Type", "Factor Value"), "node": ("Sample Name", "Source Name", "Image File", "Raw Data File", "Derived Data File", "Acquisition Parameter Data File", - "Extract Name", "Labeled Extract Name"), + "Extract Name", "Labeled Extract Name", "Array Data File", + "Raw Spectral Data File", "Protein Assignment File", "Peptide Assignment File", + "Post Translational Modification Assignment File", "Derived Spectral Data File", + "Derived Array Data File"), "node_assay": ("Assay Name", "Data Transformation Name", "Normalization Name"), "processing": "Protocol REF", "parameter": ("Parameter Value", "Array Design REF") } - self._synonyms = {"Array Data File": "Raw Data File", - "Free Induction Decay Data File": "Raw Data File", - "Derived Array Data File" : "Derived Data File", + self._synonyms = { "Hybridization Assay Name": "Assay Name", "Scan Name": "Assay Name", - "Array Data Matrix File": "Derived Data File", - "Derived Array Data Matrix File": "Derived Data File", - "Raw Spectral Data File": "Raw Data File", - "Derived Spectral Data File": "Derived Data File", - "MS Assay Name": "Assay Name", - "Protein Assignment File": "Derived Data File", - "Peptide Assignment File": "Derived Data File", - "Post Translational Modification Assignment File": "Derived Data File"} + "MS Assay Name": "Assay Name" + } def parse(self, rec): """Retrieve row data from files associated with the ISATabRecord. 
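The process key tweak in PATCH 2/3 above decides whether a process row is keyed on its input node or its output node by comparing, across the whole table, the number of distinct (Protocol REF, input node) pairs with the number of distinct (Protocol REF, output node) pairs: more distinct inputs than outputs means the rows describe pooling, so the output value makes the stable key. The following is a rough standalone sketch of that idea only; the column names and values are invented and this is not the actual process_keygen code path:

    import pandas as pd

    # Toy table slice: three samples pooled into one extract by one protocol
    DF = pd.DataFrame({
        'Sample Name':  ['sample1', 'sample2', 'sample3'],
        'Protocol REF': ['extraction', 'extraction', 'extraction'],
        'Extract Name': ['extract1', 'extract1', 'extract1'],
    })

    # Same comparison as the patched process_keygen, spelled out on toy columns
    inputs_with_prot = DF[['Protocol REF', 'Sample Name']].drop_duplicates()    # 3 rows
    outputs_with_prot = DF[['Protocol REF', 'Extract Name']].drop_duplicates()  # 1 row

    for _, row in DF.iterrows():
        if len(inputs_with_prot) > len(outputs_with_prot):
            node_key = row['Extract Name']   # pooling: key the process by its output
        else:
            node_key = row['Sample Name']    # splitting or 1:1: key by the input
        print(node_key)                      # 'extract1' three times, so the three
                                             # rows collapse into one pooled process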
@@ -620,30 +615,45 @@ def _swap_synonyms(self, header): def _build_node_index(self, type, name): if type=="Source Name": return "source-"+name + elif type == "Sample Name": + return "sample-"+name + elif type == "Extract Name": + return "extract-"+name + elif type == "Labeled Extract Name": + return "labeledextract-"+name + elif type == "Raw Data File": + return "rawdatafile-"+name + elif type=="Derived Data File": + return "deriveddatafile-"+name + elif type=="Acquisition Parameter Data File": + return "acquisitionparameterfile-"+name + elif type=="Image File": + return "imagefile-"+name + elif type == "Array Data File": + return "arraydatafile-" + name + elif type == "Array Data Matrix File": + return "arraydatamatrixfile-" + name + elif type == "Derived Array Data Matrix File": + return "derivedarraydatamatrixfile-" + name + elif type == "Raw Spectral Data File": + return "rawspectraldatafile-" + name + elif type == "Derived Array Data Matrix File": + return "derivedarraydatamatrixfile-" + name + elif type == "Protein Assignment File": + return "proteinassignmentfile-" + name + elif type == "Peptide Assignment File": + return "peptideassignmentfile-" + name + elif type == "Post Translational Modification Assignment File": + return "posttranslationalmodificationassignmentfile-" + name + elif type == "Free Induction Decay Data File": + return "freeinductiondecaydatafile-" + name + elif type == "Derived Array Data File": + return "derivedarraydatafile-" + name + elif type == "Derived Spectral Data File": + return "derivedspectraldatafile-" + name else: - if type == "Sample Name": - return "sample-"+name - else: - if type == "Extract Name": - return "extract-"+name - else: - if type == "Labeled Extract Name": - return "labeledextract-"+name - else: - if type == "Raw Data File": - return "rawdatafile-"+name - else: - if type=="Derived Data File": - return "deriveddatafile-"+name - else: - if type=="Acquisition Parameter Data File": - return "acquisitionparameterfile-"+name - else: - if type=="Image File": - return "imagefile-"+name - else: - "ERROR - Type not being considered! ", type - return name + print("ERROR - Type not being considered! 
", type) + return name _record_str = \ diff --git a/isatools/isajson.py b/isatools/isajson.py index af8df4e..8fad1e3 100644 --- a/isatools/isajson.py +++ b/isatools/isajson.py @@ -44,23 +44,6 @@ def get_roles(j): roles.append(role) return roles - def _build_assay_graph(process_sequence=list()): - G = nx.DiGraph() - for process in process_sequence: - if process.next_process is not None or len(process.outputs) > 0: # first check if there"s some valid outputs to connect - if len(process.outputs) > 0: - for output in [n for n in process.outputs if not isinstance(n, DataFile)]: - G.add_edge(process, output) - else: # otherwise just connect the process to the next one - G.add_edge(process, process.next_process) - if process.prev_process is not None or len(process.inputs) > 0: - if len(process.inputs) > 0: - for input_ in process.inputs: - G.add_edge(input_, process) - else: - G.add_edge(process.prev_process, process) - return G - def get_jvalue(dict, key): if key in dict.keys(): return dict[key] @@ -1760,7 +1743,8 @@ def get_data_file(o): { "@id": id_gen(o), "name": o.filename, - "type": o.label + "type": o.label, + "comments": get_comments(o.comments) } ) diff --git a/isatools/isatab.py b/isatools/isatab.py index 4bc626d..6a4d0ca 100644 --- a/isatools/isatab.py +++ b/isatools/isatab.py @@ -47,7 +47,7 @@ 'Data Transformation Name', 'Normalization Name'] -def dump(isa_obj, output_path, i_file_name='i_investigation.txt'): +def dump(isa_obj, output_path, i_file_name='i_investigation.txt', skip_dump_tables=False): def _build_roles_str(roles): if roles is None: @@ -364,11 +364,19 @@ def _build_publications_section_df(prefix='Investigation', publications=list()): component_types = component_types[:-1] component_types_accession_numbers = component_types_accession_numbers[:-1] component_types_source_refs = component_types_source_refs[:-1] + protocol_type_term = '' + protocol_type_term_accession = '' + protocol_type_term_source_name = '' + if protocol.protocol_type: + protocol_type_term = protocol.protocol_type.term + protocol_type_term_accession = protocol.protocol_type.term_accession + if protocol.protocol_type.term_source: + protocol_type_term_source_name = protocol.protocol_type.term_source.name study_protocols_df.loc[i] = [ protocol.name, - protocol.protocol_type.term, - protocol.protocol_type.term_accession, - protocol.protocol_type.term_source.name if protocol.protocol_type.term_source else '', + protocol_type_term, + protocol_type_term_accession, + protocol_type_term_source_name, protocol.description, protocol.uri, protocol.version, @@ -390,8 +398,11 @@ def _build_publications_section_df(prefix='Investigation', publications=list()): fp.write('STUDY CONTACTS\n') study_contacts_df.to_csv(path_or_buf=fp, mode='a', sep='\t', encoding='utf-8', index_label='Study Person Last Name') - write_study_table_files(investigation, output_path) - write_assay_table_files(investigation, output_path) + if skip_dump_tables: + pass + else: + write_study_table_files(investigation, output_path) + write_assay_table_files(investigation, output_path) fp.close() return investigation @@ -433,8 +444,6 @@ def _longest_path_and_attrs(paths): longest = (length, path) return longest[1] -prev = '' # used in rolling_group(val) in write_assay_table_files(inv_obj, output_dir) - def _all_end_to_end_paths(G, start_nodes, end_nodes): paths = [] @@ -457,28 +466,13 @@ def _all_end_to_end_paths(G, start_nodes, end_nodes): paths += list(nx.algorithms.all_simple_paths(G, derives_from_node, end_node)) 
start_nodes_processed.append(derives_from_node) end_nodes_processed.append(end_node) - start_nodes_remaining = [item for item in start_nodes if item not in start_nodes_processed] end_nodes_remaining = [item for item in end_nodes if item not in end_nodes_processed] - print("{} start nodes and {} end nodes not processed, trying reverse traversal...".format(len(start_nodes_remaining), len(end_nodes_remaining))) - for end_node in end_nodes_remaining: - if isinstance(end_node, Process): - cur_node = end_node - while cur_node.prev_process: - cur_node = cur_node.prev_process - if len(cur_node.inputs) > 0: - for input_node in cur_node.inputs: - paths += list(nx.algorithms.all_simple_paths(G, input_node, end_node)) - start_nodes_processed.append(input_node) - end_nodes_processed.append(end_node) - else: - paths += list(nx.algorithms.all_simple_paths(G, cur_node, end_node)) - start_nodes_processed.append(cur_node) - end_nodes_processed.append(end_node) - elif isinstance(end_node, Sample): - processes_linked_to_sample = [p for p in G.nodes() if isinstance(p, Process) and end_node in p.outputs] - for process in processes_linked_to_sample: - cur_node = process + if len(start_nodes_remaining) + len(end_nodes_remaining) > 0: + print("{} start nodes and {} end nodes not processed, trying reverse traversal...".format(len(start_nodes_remaining), len(end_nodes_remaining))) + for end_node in end_nodes_remaining: + if isinstance(end_node, Process): + cur_node = end_node while cur_node.prev_process: cur_node = cur_node.prev_process if len(cur_node.inputs) > 0: @@ -490,369 +484,30 @@ def _all_end_to_end_paths(G, start_nodes, end_nodes): paths += list(nx.algorithms.all_simple_paths(G, cur_node, end_node)) start_nodes_processed.append(cur_node) end_nodes_processed.append(end_node) + elif isinstance(end_node, Sample): + processes_linked_to_sample = [p for p in G.nodes() if isinstance(p, Process) and end_node in p.outputs] + for process in processes_linked_to_sample: + cur_node = process + while cur_node.prev_process: + cur_node = cur_node.prev_process + if len(cur_node.inputs) > 0: + for input_node in cur_node.inputs: + paths += list(nx.algorithms.all_simple_paths(G, input_node, end_node)) + start_nodes_processed.append(input_node) + end_nodes_processed.append(end_node) + else: + paths += list(nx.algorithms.all_simple_paths(G, cur_node, end_node)) + start_nodes_processed.append(cur_node) + end_nodes_processed.append(end_node) start_nodes_remaining = [item for item in start_nodes if item not in start_nodes_processed] end_nodes_remaining = [item for item in end_nodes if item not in end_nodes_processed] - print("{} start nodes and {} end nodes not processed, trying brute force...".format(len(start_nodes_remaining), len(end_nodes_remaining))) - for start, end in itertools.product(start_nodes_remaining, end_nodes_remaining): - paths += list(nx.algorithms.all_simple_paths(G, start, end)) + if len(start_nodes_remaining) + len(end_nodes_remaining) > 0: + print("{} start nodes and {} end nodes not processed, trying brute force...".format(len(start_nodes_remaining), len(end_nodes_remaining))) + for start, end in itertools.product(start_nodes_remaining, end_nodes_remaining): + paths += list(nx.algorithms.all_simple_paths(G, start, end)) return paths -KEY_POSTFIX_UNIT = '_unit' -KEY_POSTFIX_TERMSOURCE = '_termsource' -KEY_POSTFIX_TERMACCESSION = '_termaccession' -LABEL_UNIT = 'Unit' -LABEL_TERM_SOURCE = 'Term Source REF' -LABEL_TERM_ACCESSION = 'Term Accession Number' -LABEL_PROTOCOL_REF = 'Protocol REF' - - -def 
_fv_label(factor_name): return 'Factor Value[' + factor_name + ']' - - -def _charac_label(charac_type_name): return 'Characteristics[' + charac_type_name + ']' - - -def _set_charac_cols(prefix, characteristics, cols, col_map): - for c in sorted(characteristics, key=lambda x: id(x.category)): - obj_charac_key = prefix + '_char[' + c.category.term + ']' - cols.append(obj_charac_key) - col_map[obj_charac_key] = _charac_label(c.category.term) - if isinstance(c.value, (int, float)): - cols.extend((obj_charac_key + KEY_POSTFIX_UNIT, - obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE, - obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_charac_key + KEY_POSTFIX_UNIT] = LABEL_UNIT - col_map[obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - elif isinstance(c.value, OntologyAnnotation): - cols.extend((obj_charac_key + KEY_POSTFIX_TERMSOURCE, - obj_charac_key + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_charac_key + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_charac_key + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - - -def _set_charac_vals(prefix, characteristics, df, i): - for c in sorted(characteristics, key=lambda x: id(x.category)): - obj_charac_key = prefix + '_char[' + c.category.term + ']' - df.loc[i, obj_charac_key] = c.value - if isinstance(c.value, int) or isinstance(c.value, float): - df.loc[i, obj_charac_key + KEY_POSTFIX_UNIT] = c.unit.term - df.loc[i, obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE] = c.unit.term_source.name if c.unit.term_source else '' - df.loc[i, obj_charac_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION] = c.unit.term_accession - elif isinstance(c.value, OntologyAnnotation): - df.loc[i, obj_charac_key] = c.value.term - df.loc[i, obj_charac_key + KEY_POSTFIX_TERMSOURCE] = c.value.term_source.name if c.value.term_source else '' - df.loc[i, obj_charac_key + KEY_POSTFIX_TERMACCESSION] = c.value.term_accession - - -def _set_factor_value_cols(prefix, factor_values, cols, col_map): - for fv in sorted(factor_values, key=lambda x: id(x.factor_name)): - obj_fv_key = prefix + '_fv[' + fv.factor_name.name + ']' - cols.append(obj_fv_key) - col_map[obj_fv_key] = _fv_label(fv.factor_name.name) - if isinstance(fv.value, int) or isinstance(fv.value, float): - cols.extend((obj_fv_key + KEY_POSTFIX_UNIT, - obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE, - obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_fv_key + KEY_POSTFIX_UNIT] = LABEL_UNIT - col_map[obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - elif isinstance(fv.value, OntologyAnnotation): - cols.extend((obj_fv_key + KEY_POSTFIX_TERMSOURCE, - obj_fv_key + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_fv_key + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_fv_key + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - - -def _set_factor_value_vals(prefix, factor_values, df, i): - for fv in sorted(factor_values, key=lambda x: id(x.factor_name)): - obj_fv_key = prefix + '_fv[' + fv.factor_name.name + ']' - df.loc[i, obj_fv_key] = fv.value - if isinstance(fv.value, int) or isinstance(fv.value, float): - df.loc[i, obj_fv_key + KEY_POSTFIX_UNIT] = fv.unit.term - df.loc[i, obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE] = fv.unit.term_source.name if fv.unit.term_source else 
'' - df.loc[i, obj_fv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION] = fv.unit.term_accession - elif isinstance(fv.value, OntologyAnnotation): - df.loc[i, obj_fv_key] = fv.value.term - df.loc[i, obj_fv_key + KEY_POSTFIX_TERMSOURCE] = fv.value.term_source.name if fv.value.term_source else '' - df.loc[i, obj_fv_key + KEY_POSTFIX_TERMACCESSION] = fv.value.term_accession - -KEY_POSTFIX_DATE = '_date' -LABEL_DATE = 'Date' -KEY_POSTFIX_PERFORMER = '_performer' -LABEL_PERFORMER = 'Performer' - - -def _parameter_value_label(parameter_name): return 'Parameter Value[' + parameter_name + ']' - - -def _set_protocol_cols(protrefcount, prottypes, process, cols, col_map): - obj_process_key = 'protocol[' + str(protrefcount) + ']' - cols.append(obj_process_key) - col_map[obj_process_key] = LABEL_PROTOCOL_REF - if process.date is not None: - cols.append(obj_process_key + KEY_POSTFIX_DATE) - col_map[obj_process_key + KEY_POSTFIX_DATE] = LABEL_DATE - if process.performer is not None: - cols.append(obj_process_key + KEY_POSTFIX_PERFORMER) - col_map[obj_process_key + KEY_POSTFIX_PERFORMER] = LABEL_PERFORMER - for pv in reversed(sorted(process.parameter_values, key=lambda x: x.category.parameter_name.name)): - obj_process_pv_key = '_pv[' + pv.category.parameter_name.name + ']' - if isinstance(pv.value, int) or isinstance(pv.value, float): - cols.extend((obj_process_key + obj_process_pv_key, - obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT, - obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE, - obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_process_key + obj_process_pv_key] = _parameter_value_label(pv.category.parameter_name.name) - col_map[obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT] = LABEL_UNIT - col_map[obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_process_key + obj_process_pv_key + KEY_POSTFIX_UNIT + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - elif isinstance(pv.value, OntologyAnnotation): - cols.extend((obj_process_key + obj_process_pv_key, - obj_process_key + obj_process_pv_key + KEY_POSTFIX_TERMSOURCE, - obj_process_key + obj_process_pv_key + KEY_POSTFIX_TERMACCESSION)) - col_map[obj_process_key + obj_process_pv_key] = _parameter_value_label(pv.category.parameter_name.name) - col_map[obj_process_key + obj_process_pv_key + KEY_POSTFIX_TERMSOURCE] = LABEL_TERM_SOURCE - col_map[obj_process_key + obj_process_pv_key + KEY_POSTFIX_TERMACCESSION] = LABEL_TERM_ACCESSION - else: - cols.append(obj_process_key + obj_process_pv_key) - col_map[obj_process_key + obj_process_pv_key] = _parameter_value_label(pv.category.parameter_name.name) - for output in [x for x in process.outputs if isinstance(x, DataFile)]: - cols.append('data[' + output.label + ']') - col_map['data[' + output.label + ']'] = output.label - for comment in output.comments: - cols.append('data[' + output.label + ']_comment[' + comment.name + ']') - col_map['data[' + output.label + ']_comment[' + comment.name + ']'] = 'Comment[' + comment.name + ']' - if process.executes_protocol.protocol_type.name not in prottypes.values(): - prottypes[protrefcount] = process.executes_protocol.protocol_type.name - protrefcount += 1 - - -def write_assay_table_files(inv_obj, output_dir): - """ - Writes out assay table files according to pattern defined by - - Sample Name, - Protocol Ref: 'sample collection', [ ParameterValue[], ... ], - Material Name, [ Characteristics[], ... 
] - [ FactorValue[], ... ] - - - """ - if isinstance(inv_obj, Investigation): - for study_obj in inv_obj.studies: - for assay_obj in study_obj.assays: - if assay_obj.graph is None: break - cols = list() - mcount = 0 - protrefcount = 0 - protnames = dict() - col_map = dict() - start_nodes, end_nodes = _get_start_end_nodes(assay_obj.graph) - paths = _all_end_to_end_paths(assay_obj.graph, start_nodes, end_nodes) - for node in _longest_path_and_attrs(paths): - if isinstance(node, Sample): - cols.append('sample') - col_map['sample'] = 'Sample Name' - elif isinstance(node, Material): - if node.type == 'Labeled Extract Name': - cols.append('lextract') - cols.append('lextract_label') - cols.append('lextract_label_termsource') - cols.append('lextract_label_termaccession') - col_map['lextract'] = 'Labeled Extract Name' - col_map['lextract_label'] = 'Label' - col_map['lextract_label_termsource'] = 'Term Source REF' - col_map['lextract_label_termaccession'] = 'Term Accession Number' - elif node.type == 'Extract Name': - cols.append('extract') - col_map['extract'] = 'Extract Name' - _set_charac_cols('extract', node.characteristics, cols, col_map) - else: - cols.append('material[' + str(mcount) + ']') - col_map['material[' + str(mcount) + ']'] = 'Material Name' - _set_charac_cols('material', node.characteristics, cols, col_map) - mcount += 1 - elif isinstance(node, Process): - cols.append('protocol[' + str(protrefcount) + ']') - col_map['protocol[' + str(protrefcount) + ']'] = 'Protocol REF' - if node.date is not None: - cols.append('protocol[' + str(protrefcount) + ']_date') - col_map['protocol[' + str(protrefcount) + ']_date'] = 'Date' - if node.performer is not None: - cols.append('protocol[' + str(protrefcount) + ']_performer') - col_map['protocol[' + str(protrefcount) + ']_performer'] = 'Performer' - for pv in reversed(sorted(node.parameter_values, key=lambda x: x.category.parameter_name.term)): - if isinstance(pv.value, int) or isinstance(pv.value, float): - cols.extend(('protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']', - 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit', - 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termsource', - 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termaccession')) - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = 'Parameter Value[' + pv.category.parameter_name.term + ']' - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit'] = 'Unit' - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termsource'] = 'Term Source REF' - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termaccession'] = 'Term Accession Number' - elif isinstance(pv.value, OntologyAnnotation): - cols.extend(('protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']', - 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termsource', - 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termaccession',)) - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = 'Parameter Value[' + pv.category.parameter_name.term + ']' - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termsource'] = 'Term Source REF' - col_map['protocol[' + 
str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termaccession'] = 'Term Accession Number' - else: - cols.append('protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']',) - col_map['protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = 'Parameter Value[' + pv.category.parameter_name.term + ']' - if node.executes_protocol.protocol_type.term == 'nucleic acid sequencing': - cols.append('protocol[' + str(protrefcount) + ']_prop[' + 'Assay Name' + ']') - col_map['protocol[' + str(protrefcount) + ']_prop[' + 'Assay Name' + ']'] = 'Assay Name' - elif node.executes_protocol.protocol_type.term == 'nucleic acid hybridization': - cols.append('protocol[' + str(protrefcount) + ']_prop[' + 'Hybridization Assay Name' + ']') - col_map['protocol[' + str(protrefcount) + ']_prop[' + 'Hybridization Assay Name' + ']'] = 'Hybridization Assay Name' - cols.append('protocol[' + str(protrefcount) + ']_prop[' + 'Array Design REF' + ']') - col_map['protocol[' + str(protrefcount) + ']_prop[' + 'Array Design REF' + ']'] = 'Array Design REF' - elif node.executes_protocol.protocol_type.term == 'data collection': - cols.append('protocol[' + str(protrefcount) + ']_prop[' + 'Scan Name' + ']') - col_map['protocol[' + str(protrefcount) + ']_prop[' + 'Scan Name' + ']'] = 'Scan Name' - elif node.executes_protocol.protocol_type.term == 'mass spectrometry': - cols.append('protocol[' + str(protrefcount) + ']_prop[' + 'MS Assay Name' + ']') - col_map['protocol[' + str(protrefcount) + ']_prop[' + 'MS Assay Name' + ']'] = 'MS Assay Name' - - for output in [x for x in node.outputs if isinstance(x, DataFile)]: - cols.append('data[' + output.label + ']') - col_map['data[' + output.label + ']'] = output.label - if output.comments: - for comment in output.comments: - cols.append('data[' + output.label + ']_comment[' + comment.name + ']') - col_map['data[' + output.label + ']_comment[' + comment.name + ']'] = 'Comment[' + comment.name + ']' - if node.executes_protocol.name not in protnames.keys(): - protnames[node.executes_protocol.name] = protrefcount - protrefcount += 1 - # protrefcount = _set_protocol_cols(protrefcount, prottypes, node, cols, col_map) - elif isinstance(node, DataFile): - pass # we process DataFile above inside Process - import pandas as pd - df = pd.DataFrame(columns=cols) - i = 0 - for path in paths: - mcount = 0 - compound_key = str() - for node in path: - if isinstance(node, Sample): - df.loc[i, 'sample'] = node.name - compound_key += node.name + '/' - elif isinstance(node, Material): - if node.type == 'Labeled Extract Name': - df.loc[i, 'lextract'] = node.name - compound_key += node.name + '/' - df.loc[i, 'lextract_label'] = node.characteristics[0].value.term - df.loc[i, 'lextract_label_termsource'] = node.characteristics[0].value.term_source.name if node.characteristics[0].value.term_source else '' - df.loc[i, 'lextract_label_termaccession'] = node.characteristics[0].value.term_accession - elif node.type == 'Extract Name': - df.loc[i, 'extract'] = node.name - compound_key += node.name + '/' - _set_charac_vals('extract', node.characteristics, df, i) - else: - df.loc[i, 'material[' + str(mcount) + ']'] = node.name - compound_key += node.name + '/' - _set_charac_vals('material', node.characteristics, df, i) - mcount += 1 - elif isinstance(node, Process): - def find(n): - v = 0 - for k, v in protnames.items(): - if k == n.executes_protocol.name: - return v - return v - protrefcount = find(node) - df.loc[i, 'protocol[' + str(protrefcount) + ']'] = 
node.executes_protocol.name - compound_key += str(protrefcount) + '/' + node.name + '/' - if node.date is not None: - df.loc[i, 'protocol[' + str(protrefcount) + ']_date'] = node.date - if node.performer is not None: - df.loc[i, 'protocol[' + str(protrefcount) + ']_performer'] = node.performer - for pv in reversed(sorted(node.parameter_values, key=lambda x: x.category.parameter_name.term)): - if isinstance(pv.value, int) or isinstance(pv.value, float): - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = pv.value - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit'] = pv.unit.term - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termsource'] = pv.unit.term_source.name if pv.unit.term_source else '' - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_unit_termaccession'] = pv.unit.term_accession - elif isinstance(pv.value, OntologyAnnotation): - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = pv.value.term - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termsource'] = pv.value.term_source.name if pv.value.term_source else '' - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']_termaccession'] = pv.value.term_accession - else: - df.loc[i, 'protocol[' + str(protrefcount) + ']_pv[' + pv.category.parameter_name.term + ']'] = pv.value - if node.executes_protocol.protocol_type.term == 'nucleic acid sequencing': - df.loc[i, 'protocol[' + str(protrefcount) + ']_prop[' + 'Assay Name' + ']'] = node.name - compound_key += str(protrefcount) + '/' + 'Assay Name' + '/' + node.name - elif node.executes_protocol.protocol_type.term == 'nucleic acid hybridization': - df.loc[i, 'protocol[' + str(protrefcount) + ']_prop[' + 'Hybridization Assay Name' + ']'] = node.name - compound_key += str(protrefcount) + '/' + 'Hybridization Assay Name' + '/' + node.name - df.loc[i, 'protocol[' + str(protrefcount) + ']_prop[' + 'Array Design REF' + ']'] = node.array_design_ref - compound_key += str(protrefcount) + '/' + 'Array Design REF' + '/' + node.array_design_ref - elif node.executes_protocol.protocol_type.term == 'data collection': - df.loc[i, 'protocol[' + str(protrefcount) + ']_prop[' + 'Scan Name' + ']'] = node.name - compound_key += str(protrefcount) + '/' + 'Scan Name' + '/' + node.name - elif node.executes_protocol.protocol_type.term == 'mass spectrometry': - df.loc[i, 'protocol[' + str(protrefcount) + ']_prop[' + 'MS Assay Name' + ']'] = node.name - compound_key += str(protrefcount) + '/' + 'MS Assay Name' + '/' + node.name - for output in [x for x in node.outputs if isinstance(x, DataFile)]: - df.loc[i, 'data[' + output.label + ']'] = output.filename - if output.comments: - for comment in output.comments: - df.loc[i, 'data[' + output.label + ']_comment[' + comment.name + ']'] = comment.value - df.loc[i, 'compound_key'] = compound_key - i += 1 - - # reduce rows of data on separate lines - - # can we group by matching all columns minus the data columns? 
- # cols_no_data = [col for col in cols if not _RX_DATA.match(col)] # column list without data cols - - # calculate groupings - def rolling_group(val): - global prev - if val != prev: - rolling_group.group += 1 # val != prev is signal to switch group; rows sorted by cols_no_data - prev = val - return rolling_group.group - rolling_group.group = 0 # static variable - groups = df.groupby(df['compound_key'].apply(rolling_group), as_index=True) # groups by column 1 only - - # merge items in column groups - def reduce(group, column): - col = group[column] - s = [str(each) for each in col if pd.notnull(each)] - if len(s) > 0: - return s[0] - else: - return '' - df = groups.apply(lambda g: pd.Series([reduce(g, col) for col in g.columns], index=g.columns)) - - # cleanup column headers before writing out df - # WARNING: don't just dump out col_map.values() as we need to put columns back in order - df = df.sort_values(by=df.columns[0], ascending=True) # arbitrary sort on column 0 (Sample name) - del df['compound_key'] # release compound_key as we don't write it out - for i, col in enumerate(df.columns): - cols[i] = col_map[col] - if col_map[col] == 'Characteristics[Material Type]': - cols[i] = 'Material Type' - if col_map[col] == 'Parameter Value[Array Design REF]': - cols[i] = 'Array Design REF' - if _RX_DATA.match(col) is not None: - if _RX_DATA.findall(col)[0] == 'Raw Data File': - if assay_obj.technology_type.term == 'DNA microarray': - cols[i] = 'Array Data File' - df.columns = cols # reset column headers - # drop completely empty columns - import numpy as np - df = df.replace('', np.nan) - df = df.dropna(axis=1, how='all') - assay_obj.df = df - with open(os.path.join(output_dir, assay_obj.filename), 'w') as out_fp: - df.to_csv(path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8',) - def write_study_table_files(inv_obj, output_dir): """ @@ -875,92 +530,39 @@ def write_study_table_files(inv_obj, output_dir): protnames = dict() flatten = lambda l: [item for sublist in l for item in sublist] - jcolumns = [] - - def get_value_columns(label, x): - if isinstance(x.value, (int, float)) and x.unit: - if isinstance(x.unit, OntologyAnnotation): - return map(lambda x: "{0}.{1}".format(label, x), - ["Unit", "Unit.Term Source REF", "Unit.Term Accession Number"]) - else: - return ["{0}.Unit".format(label)] - elif isinstance(x.value, OntologyAnnotation): - return map(lambda x: "{0}.{1}".format(label, x), ["Term Source REF", "Term Accession Number"]) - else: - return [] - - def get_characteristic_columns(label, c): - columns = ["{0}.Characteristics[{1}]".format(label, c.category.term)] - columns.extend(get_value_columns(columns[0], c)) - return columns - - def get_fv_columns(label, fv): - columns = ["{0}.Factor Value[{1}]".format(label, fv.factor_name.name)] - columns.extend(get_value_columns(columns[0], fv)) - return columns - - def write_value_columns(df_dict, label, x): - if isinstance(x.value, (int, float)) and x.unit: - if isinstance(x.unit, OntologyAnnotation): - df_dict[label][-1] = x.value - df_dict[label + ".Unit"][-1] = x.unit.term - df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name if x.unit.term_source else "" - df_dict[label + ".Unit.Term Accession Number"][-1] = x.unit.term_accession - else: - df_dict[label][-1] = x.value - df_dict[label + ".Unit"][-1] = x.unit - elif isinstance(x.value, OntologyAnnotation): - try: - df_dict[label][-1] = x.value.term - except KeyError: - print(df_dict.keys()) - raise KeyError - df_dict[label + ".Term Source REF"][-1] = 
x.value.term_source.name if x.value.term_source else "" - df_dict[label + ".Term Accession Number"][-1] = x.value.term_accession - else: - df_dict[label][-1] = x.value + columns = [] start_nodes, end_nodes = _get_start_end_nodes(study_obj.graph) paths = _all_end_to_end_paths(study_obj.graph, start_nodes, end_nodes) for node in _longest_path_and_attrs(paths): if isinstance(node, Source): olabel = "Source Name" - jcolumns.append(olabel) - - jcolumns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) + columns.append(olabel) + columns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) elif isinstance(node, Process): olabel = "Protocol REF.{}".format(node.executes_protocol.name) - jcolumns.append(olabel) - + columns.append(olabel) if node.date is not None: - jcolumns.append(olabel + ".Date") + columns.append(olabel + ".Date") if node.performer is not None: - jcolumns.append(olabel + ".Performer") - - def get_pv_columns(label, pv): - columns = ["{0}.Parameter Value[{1}]".format(label, pv.category.parameter_name.term)] - columns.extend(get_value_columns(columns[0], pv)) - return columns - - jcolumns += flatten(map(lambda x: get_pv_columns(olabel, x), node.parameter_values)) - + columns.append(olabel + ".Performer") + columns += flatten(map(lambda x: get_pv_columns(olabel, x), node.parameter_values)) if node.executes_protocol.name not in protnames.keys(): protnames[node.executes_protocol.name] = protrefcount protrefcount += 1 elif isinstance(node, Sample): olabel = "Sample Name" - jcolumns.append(olabel) + columns.append(olabel) + columns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) + columns += flatten(map(lambda x: get_fv_columns(olabel, x), node.factor_values)) - jcolumns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) - jcolumns += flatten(map(lambda x: get_fv_columns(olabel, x), node.factor_values)) - - omap = get_object_column_map(jcolumns, jcolumns) + omap = get_object_column_map(columns, columns) # load into dictionary df_dict = dict(map(lambda k: (k, []), flatten(omap))) - + from progressbar import ProgressBar, SimpleProgress, Bar, ETA pbar = ProgressBar(min_value=0, max_value=len(paths), widgets=['Writing {} paths: '.format(len(paths)), SimpleProgress(), @@ -975,24 +577,17 @@ def get_pv_columns(label, pv): if isinstance(node, Source): olabel = "Source Name" df_dict[olabel][-1] = node.name - for c in node.characteristics: clabel = "{0}.Characteristics[{1}]".format(olabel, c.category.term) - try: - write_value_columns(df_dict, clabel, c) - except KeyError: - print(node.__dict__) - raise KeyError + write_value_columns(df_dict, clabel, c) elif isinstance(node, Process): olabel = "Protocol REF.{}".format(node.executes_protocol.name) df_dict[olabel][-1] = node.executes_protocol.name - if node.date is not None: df_dict[olabel + ".Date"][-1] = node.date if node.performer is not None: df_dict[olabel + ".Performer"][-1] = node.performer - for pv in node.parameter_values: pvlabel = "{0}.Parameter Value[{1}]".format(olabel, pv.category.parameter_name.term) write_value_columns(df_dict, pvlabel, pv) @@ -1000,7 +595,6 @@ def get_pv_columns(label, pv): elif isinstance(node, Sample): olabel = "Sample Name" df_dict[olabel][-1] = node.name - for c in node.characteristics: clabel = "{0}.Characteristics[{1}]".format(olabel, c.category.term) write_value_columns(df_dict, clabel, c) @@ -1009,33 +603,33 @@ def get_pv_columns(label, pv): 
write_value_columns(df_dict, fvlabel, fv) pbar.finish() - DF = pd.DataFrame(columns=jcolumns) + DF = pd.DataFrame(columns=columns) DF = DF.from_dict(data=df_dict) - DF = DF[jcolumns] # reorder columns + DF = DF[columns] # reorder columns DF = DF.sort_values(by=DF.columns[0], ascending=True) # arbitrary sort on column 0 - for i, col in enumerate(jcolumns): + for i, col in enumerate(columns): if col.endswith("Term Source REF"): - jcolumns[i] = "Term Source REF" + columns[i] = "Term Source REF" elif col.endswith("Term Accession Number"): - jcolumns[i] = "Term Accession Number" + columns[i] = "Term Accession Number" elif col.endswith("Unit"): - jcolumns[i] = "Unit" + columns[i] = "Unit" elif "Characteristics[" in col: if "material type" in col.lower(): - jcolumns[i] = "Material Type" + columns[i] = "Material Type" else: - jcolumns[i] = col[col.rindex(".")+1:] + columns[i] = col[col.rindex(".") + 1:] elif "Factor Value[" in col: - jcolumns[i] = col[col.rindex(".")+1:] + columns[i] = col[col.rindex(".") + 1:] elif "Parameter Value[" in col: - jcolumns[i] = col[col.rindex(".")+1:] + columns[i] = col[col.rindex(".") + 1:] elif "Protocol REF" in col: - jcolumns[i] = "Protocol REF" + columns[i] = "Protocol REF" elif col.endswith("Date"): - jcolumns[i] = "Date" + columns[i] = "Date" elif col.endswith("Performer"): - jcolumns[i] = "Performer" + columns[i] = "Performer" print("Rendered {} paths".format(len(DF.index))) DF_no_dups = DF.drop_duplicates() @@ -1045,7 +639,7 @@ def get_pv_columns(label, pv): print("Writing {} rows".format(len(DF.index))) # reset columns, replace nan with empty string, drop empty columns - DF.columns = jcolumns + DF.columns = columns DF = DF.replace('', np.nan) DF = DF.dropna(axis=1, how='all') @@ -1053,6 +647,258 @@ def get_pv_columns(label, pv): DF.to_csv(path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8') +def write_assay_table_files(inv_obj, output_dir): + """ + Writes out assay table files according to pattern defined by + + Sample Name, + Protocol Ref: 'sample collection', [ ParameterValue[], ... ], + Material Name, [ Characteristics[], ... ] + [ FactorValue[], ... 
] + + + """ + + if not isinstance(inv_obj, Investigation): + raise NotImplementedError + for study_obj in inv_obj.studies: + for assay_obj in study_obj.assays: + if assay_obj.graph is None: break + protrefcount = 0 + protnames = dict() + + flatten = lambda l: [item for sublist in l for item in sublist] + columns = [] + + start_nodes, end_nodes = _get_start_end_nodes(assay_obj.graph) + paths = _all_end_to_end_paths(assay_obj.graph, start_nodes, end_nodes) + for node in _longest_path_and_attrs(paths): + if isinstance(node, Sample): + olabel = "Sample Name" + columns.append(olabel) + # columns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) + # columns += flatten(map(lambda x: get_fv_columns(olabel, x), node.factor_values)) + + elif isinstance(node, Process): + olabel = "Protocol REF.{}".format(node.executes_protocol.name) + columns.append(olabel) + if node.date is not None: + columns.append(olabel + ".Date") + if node.performer is not None: + columns.append(olabel + ".Performer") + oname_label = None + if node.executes_protocol.protocol_type.term == "nucleic acid sequencing": + oname_label = "Assay Name" + elif node.executes_protocol.protocol_type.term == "data collection": + oname_label = "Scan Name" + elif node.executes_protocol.protocol_type.term == "mass spectrometry": + oname_label = "MS Assay Name" + if oname_label is not None: + columns.append(oname_label) + elif node.executes_protocol.protocol_type.term == "nucleic acid hybridization": + columns.extend(["Hybridization Assay Name", "Array Design REF"]) + + columns += flatten(map(lambda x: get_pv_columns(olabel, x), node.parameter_values)) + if node.executes_protocol.name not in protnames.keys(): + protnames[node.executes_protocol.name] = protrefcount + protrefcount += 1 + + for output in [x for x in node.outputs if isinstance(x, DataFile)]: + columns.append(output.label) + columns += flatten(map(lambda x: get_comment_column(output.label, x), output.comments)) + + elif isinstance(node, Material): + olabel = node.type + columns.append(olabel) + columns += flatten(map(lambda x: get_characteristic_columns(olabel, x), node.characteristics)) + + elif isinstance(node, DataFile): + pass # handled in process + + omap = get_object_column_map(columns, columns) + + # load into dictionary + df_dict = dict(map(lambda k: (k, []), flatten(omap))) + + from progressbar import ProgressBar, SimpleProgress, Bar, ETA + pbar = ProgressBar(min_value=0, max_value=len(paths), widgets=['Writing {} paths: '.format(len(paths)), + SimpleProgress(), + Bar(left=" |", right="| "), ETA()]).start() + + for path in pbar(paths): + for k in df_dict.keys(): # add a row per path + df_dict[k].extend([""]) + + for node in path: + + if isinstance(node, Source): + olabel = "Source Name" + df_dict[olabel][-1] = node.name + for c in node.characteristics: + clabel = "{0}.Characteristics[{1}]".format(olabel, c.category.term) + try: + write_value_columns(df_dict, clabel, c) + except KeyError: + print(node.__dict__) + raise KeyError + + elif isinstance(node, Process): + olabel = "Protocol REF.{}".format(node.executes_protocol.name) + df_dict[olabel][-1] = node.executes_protocol.name + if node.date is not None: + df_dict[olabel + ".Date"][-1] = node.date + if node.performer is not None: + df_dict[olabel + ".Performer"][-1] = node.performer + for pv in node.parameter_values: + pvlabel = "{0}.Parameter Value[{1}]".format(olabel, pv.category.parameter_name.term) + write_value_columns(df_dict, pvlabel, pv) + oname_label = None + if 
node.executes_protocol.protocol_type.term == "nucleic acid sequencing": + oname_label = "Assay Name" + elif node.executes_protocol.protocol_type.term == "data collection": + oname_label = "Scan Name" + elif node.executes_protocol.protocol_type.term == "mass spectrometry": + oname_label = "MS Assay Name" + if oname_label is not None: + df_dict[oname_label][-1] = node.name + elif node.executes_protocol.protocol_type.term == "nucleic acid hybridization": + df_dict["Hybridization Assay Name"][-1] = node.name + df_dict["Array Design REF"][-1] = node.array_design_ref + for output in [x for x in node.outputs if isinstance(x, DataFile)]: + olabel = output.label + df_dict[olabel][-1] = output.filename + for co in output.comments: + colabel = "{0}.Comment[{1}]".format(olabel, co.name) + df_dict[colabel][-1] = co.value + + elif isinstance(node, Sample): + olabel = "Sample Name" + df_dict[olabel][-1] = node.name + # for c in node.characteristics: + # clabel = "{0}.Characteristics[{1}]".format(olabel, c.category.term) + # write_value_columns(df_dict, clabel, c) + # for fv in node.factor_values: + # fvlabel = "{0}.Factor Value[{1}]".format(olabel, fv.factor_name.name) + # write_value_columns(df_dict, fvlabel, fv) + + elif isinstance(node, Material): + olabel = node.type + df_dict[olabel][-1] = node.name + for c in node.characteristics: + clabel = "{0}.Characteristics[{1}]".format(olabel, c.category.term) + write_value_columns(df_dict, clabel, c) + + elif isinstance(node, DataFile): + pass # handled in process + + pbar.finish() + + DF = pd.DataFrame(columns=columns) + DF = DF.from_dict(data=df_dict) + DF = DF[columns] # reorder columns + DF = DF.sort_values(by=DF.columns[0], ascending=True) # arbitrary sort on column 0 + + for i, col in enumerate(columns): + if col.endswith("Term Source REF"): + columns[i] = "Term Source REF" + elif col.endswith("Term Accession Number"): + columns[i] = "Term Accession Number" + elif col.endswith("Unit"): + columns[i] = "Unit" + elif "Characteristics[" in col: + if "material type" in col.lower(): + columns[i] = "Material Type" + elif "label" in col.lower(): + columns[i] = "Label" + else: + columns[i] = col[col.rindex(".") + 1:] + elif "Factor Value[" in col: + columns[i] = col[col.rindex(".") + 1:] + elif "Parameter Value[" in col: + columns[i] = col[col.rindex(".") + 1:] + elif "Protocol REF" in col: + columns[i] = "Protocol REF" + elif col.endswith("Date"): + columns[i] = "Date" + elif col.endswith("Performer"): + columns[i] = "Performer" + elif "Comment[" in col: + columns[i] = col[col.rindex(".") + 1:] + + print("Rendered {} paths".format(len(DF.index))) + if len(DF.index) > 1: + if len(DF.index) > len(DF.drop_duplicates().index): + print("Dropping duplicates...") + DF = DF.drop_duplicates() + + print("Writing {} rows".format(len(DF.index))) + # reset columns, replace nan with empty string, drop empty columns + DF.columns = columns + DF = DF.replace('', np.nan) + DF = DF.dropna(axis=1, how='all') + + with open(os.path.join(output_dir, assay_obj.filename), 'w') as out_fp: + DF.to_csv(path_or_buf=out_fp, index=False, sep='\t', encoding='utf-8') + + +def get_value_columns(label, x): + if isinstance(x.value, (int, float)) and x.unit: + if isinstance(x.unit, OntologyAnnotation): + return map(lambda x: "{0}.{1}".format(label, x), + ["Unit", "Unit.Term Source REF", "Unit.Term Accession Number"]) + else: + return ["{0}.Unit".format(label)] + elif isinstance(x.value, OntologyAnnotation): + return map(lambda x: "{0}.{1}".format(label, x), ["Term Source REF", "Term Accession 
Number"]) + else: + return [] + + +def get_characteristic_columns(label, c): + columns = ["{0}.Characteristics[{1}]".format(label, c.category.term)] + columns.extend(get_value_columns(columns[0], c)) + return columns + + +def get_fv_columns(label, fv): + columns = ["{0}.Factor Value[{1}]".format(label, fv.factor_name.name)] + columns.extend(get_value_columns(columns[0], fv)) + return columns + + +def get_comment_column(label, c): + columns = ["{0}.Comment[{1}]".format(label, c.name)] + return columns + + +def write_value_columns(df_dict, label, x): + if isinstance(x.value, (int, float)) and x.unit: + if isinstance(x.unit, OntologyAnnotation): + df_dict[label][-1] = x.value + df_dict[label + ".Unit"][-1] = x.unit.term + df_dict[label + ".Unit.Term Source REF"][-1] = x.unit.term_source.name if x.unit.term_source else "" + df_dict[label + ".Unit.Term Accession Number"][-1] = x.unit.term_accession + else: + df_dict[label][-1] = x.value + df_dict[label + ".Unit"][-1] = x.unit + elif isinstance(x.value, OntologyAnnotation): + try: + df_dict[label][-1] = x.value.term + except KeyError: + print(df_dict.keys()) + raise KeyError + df_dict[label + ".Term Source REF"][-1] = x.value.term_source.name if x.value.term_source else "" + df_dict[label + ".Term Accession Number"][-1] = x.value.term_accession + else: + df_dict[label][-1] = x.value + + +def get_pv_columns(label, pv): + columns = ["{0}.Parameter Value[{1}]".format(label, pv.category.parameter_name.term)] + columns.extend(get_value_columns(columns[0], pv)) + return columns + + def read_investigation_file(fp): def _peek(f): @@ -2827,7 +2673,8 @@ def validate2(fp, config_dir=default_config_dir, log_level=logging.INFO): else: from isatools import utils try: - utils.detect_isatab_process_pooling(os.path.dirname(fp.name)) + fp.seek(0) + utils.detect_isatab_process_pooling(fp) except: pass logger.info("Finished validation...") @@ -2932,7 +2779,7 @@ def dumps(isa_obj): return output -def load(FP): # from DF of investigation file +def load(FP, skip_load_tables=False): # from DF of investigation file def get_ontology_source(term_source_ref): try: @@ -3080,30 +2927,33 @@ def get_contacts(section_df): study.protocols.append(protocol) protocol_map[protocol.name] = protocol study.protocols = list(protocol_map.values()) - study_tfile_df = read_tfile(os.path.join(os.path.dirname(FP.name), study.filename)) - sources, samples, _, __, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( - investigation.ontology_source_references, study_protocols=study.protocols, - study_factors=study.factors).create_from_df(study_tfile_df) - study.materials['sources'] = list(sources.values()) - study.materials['samples'] = list(samples.values()) - study.process_sequence = list(processes.values()) - study.characteristic_categories = list(characteristic_categories.values()) - study.units = list(unit_categories.values()) - - for process in study.process_sequence: - try: - process.executes_protocol = protocol_map[process.executes_protocol] - except KeyError: + if skip_load_tables: + pass + else: + study_tfile_df = read_tfile(os.path.join(os.path.dirname(FP.name), study.filename)) + sources, samples, _, __, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( + investigation.ontology_source_references, study_protocols=study.protocols, + study_factors=study.factors).create_from_df(study_tfile_df) + study.materials['sources'] = list(sources.values()) + study.materials['samples'] = list(samples.values()) + study.process_sequence = 
list(processes.values()) + study.characteristic_categories = list(characteristic_categories.values()) + study.units = list(unit_categories.values()) + + for process in study.process_sequence: try: - unknown_protocol = protocol_map['unknown'] + process.executes_protocol = protocol_map[process.executes_protocol] except KeyError: - protocol_map['unknown'] = Protocol( - name="unknown protocol", - description="This protocol was auto-generated where a protocol could not be determined.") - unknown_protocol = protocol_map['unknown'] - study.protocols.append(unknown_protocol) - process.executes_protocol = unknown_protocol - + try: + unknown_protocol = protocol_map['unknown'] + except KeyError: + protocol_map['unknown'] = Protocol( + name="unknown protocol", + description="This protocol was auto-generated where a protocol could not be determined.") + unknown_protocol = protocol_map['unknown'] + study.protocols.append(unknown_protocol) + process.executes_protocol = unknown_protocol + for _, row in df_dict['s_assays'][i].iterrows(): assay = Assay() assay.filename = row['Study Assay File Name'] @@ -3118,38 +2968,38 @@ def get_contacts(section_df): row['Study Assay Technology Type Term Source REF'] ) assay.technology_platform = row['Study Assay Technology Platform'] - - assay_tfile_df = read_tfile(os.path.join(os.path.dirname(FP.name), assay.filename)) - _, samples, other, data, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( - investigation.ontology_source_references, - study.materials['samples'], - study.protocols, - study.factors).create_from_df(assay_tfile_df) - assay.materials['samples'] = list(samples.values()) - assay.materials['other_material'] = list(other.values()) - assay.data_files = list(data.values()) - assay.process_sequence = list(processes.values()) - assay.characteristic_categories = list(characteristic_categories.values()) - assay.units = list(unit_categories.values()) - - for process in assay.process_sequence: - try: - process.executes_protocol = protocol_map[process.executes_protocol] - except KeyError: + if skip_load_tables: + pass + else: + assay_tfile_df = read_tfile(os.path.join(os.path.dirname(FP.name), assay.filename)) + _, samples, other, data, processes, characteristic_categories, unit_categories = ProcessSequenceFactory( + investigation.ontology_source_references, + study.materials['samples'], + study.protocols, + study.factors).create_from_df(assay_tfile_df) + assay.materials['samples'] = list(samples.values()) + assay.materials['other_material'] = list(other.values()) + assay.data_files = list(data.values()) + assay.process_sequence = list(processes.values()) + assay.characteristic_categories = list(characteristic_categories.values()) + assay.units = list(unit_categories.values()) + + for process in assay.process_sequence: try: - unknown_protocol = protocol_map['unknown'] + process.executes_protocol = protocol_map[process.executes_protocol] except KeyError: - protocol_map['unknown'] = Protocol( - name="unknown protocol", - description="This protocol was auto-generated where a protocol could not be determined.") - unknown_protocol = protocol_map['unknown'] - study.protocols.append(unknown_protocol) - process.executes_protocol = unknown_protocol - + try: + unknown_protocol = protocol_map['unknown'] + except KeyError: + protocol_map['unknown'] = Protocol( + name="unknown protocol", + description="This protocol was auto-generated where a protocol could not be determined.") + unknown_protocol = protocol_map['unknown'] + 
study.protocols.append(unknown_protocol) + process.executes_protocol = unknown_protocol + study.assays.append(assay) - investigation.studies.append(study) - return investigation @@ -3686,28 +3536,21 @@ def get_node_by_label_and_key(l, k): # now go row by row pulling out processes and linking them accordingly for _, object_series in DF.iterrows(): # don't drop duplicates - process_key_sequence = list() for _cg, column_group in enumerate(object_column_map): - # for each object, parse column group - object_label = column_group[0] if object_label.startswith('Protocol REF'): - protocol_ref = object_series[column_group[0]] - process_key = process_keygen(protocol_ref, column_group, _cg, DF.columns, object_series, _, DF) - process_key_sequence.append(process_key) # print('key sequence = ', process_key_sequence) # Link the processes in each sequence for pair in pairwise(process_key_sequence): # TODO: Make split/pool model with multi prev/next_process - l = processes[pair[0]] # get process on left of pair r = processes[pair[1]] # get process on right of pair diff --git a/isatools/model/v1.py b/isatools/model/v1.py index d728faf..f314984 100644 --- a/isatools/model/v1.py +++ b/isatools/model/v1.py @@ -84,18 +84,10 @@ class Commentable(object): comments (list, NoneType): Comments associated with the implementing ISA class (all ISA classes). """ def __init__(self, comments=None): - self.comments = comments - - @property - def comments(self): - return self.__comments - - @comments.setter - def comments(self, comments): - if comments is not None and not isinstance(comments, list): - raise AttributeError("comments must be an instance of list or None") + if comments is None: + self.comments = [] else: - self.__comments = comments + self.comments = comments class Investigation(Commentable): diff --git a/isatools/utils.py b/isatools/utils.py index a250526..8533da2 100644 --- a/isatools/utils.py +++ b/isatools/utils.py @@ -36,14 +36,10 @@ def detect_graph_process_pooling(G): return report -def detect_isatab_process_pooling(tab_path): - from isatools.convert import isatab2json - from isatools import isajson - from io import StringIO - import json - report = list() - J = isatab2json.convert(tab_path, validate_first=False, use_new_parser=True) - ISA = isajson.load(StringIO(json.dumps(J))) +def detect_isatab_process_pooling(fp): + from isatools import isatab + report = [] + ISA = isatab.load(fp) for study in ISA.studies: print("Checking {}".format(study.filename)) pooling_list = detect_graph_process_pooling(study.graph) diff --git a/tests/test_isatab.py b/tests/test_isatab.py index 30287ee..2149bcd 100644 --- a/tests/test_isatab.py +++ b/tests/test_isatab.py @@ -345,11 +345,8 @@ def test_isatab_load_bii_s_7(self): self.assertEqual(len(assay_gx.data_files), 29) # 29 data files in a_matteo-assay-Gx.txt self.assertEqual(len(assay_gx.process_sequence), 116) # 116 processes in in a_matteo-assay-Gx.txt - # def test_isatab_load_flower(self): + # def test_isatab_load_flower(self): # don't commit this as it takes a few minutes to run # with open(os.path.join(self._tab_data_dir, 'Flower_Study', 'i_Investigation.txt')) as fp: # ISA = isatab.load(fp) - # for s in ISA.studies[0].materials['samples']: - # s.derives_from = [so for so in ISA.studies[0].materials['sources'] if so.name == s.name] - # ISA.studies[0].assays = [] # print(isatab.dumps(ISA)) diff --git a/tests/test_isatab2json.py b/tests/test_isatab2json.py index e2c346a..b7e16f0 100644 --- a/tests/test_isatab2json.py +++ b/tests/test_isatab2json.py @@ -161,56 +161,56 
 
     def test_isatab2json_convert_charac_param_factor(self):
         test_case = 'TEST-ISA-charac-param-factor'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_repeated_measure(self):
         test_case = 'TEST-ISA-repeated-measure'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_sample_pool(self):
         test_case = 'TEST-ISA-sample-pool'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_sample_pool_with_error(self):
         test_case = 'TEST-ISA-sample-pool-with-error'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_source_split(self):
         test_case = 'TEST-ISA-source-split'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_source_split_with_error(self):
         test_case = 'TEST-ISA-source-split-with-error'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_bii_s_3(self):
         test_case = 'BII-S-3'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
 
     def test_isatab2json_convert_bii_s_7(self):
         test_case = 'BII-S-7'
-        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), self.identifier_type, validate_first=False)
+        actual_json = isatab2json.convert(os.path.join(self._tab_data_dir, test_case), validate_first=False)
         with open(os.path.join(self._json_data_dir, test_case, test_case + '.json')) as expected_file:
             expected_json = json.load(expected_file)
             self.assertTrue(utils.assert_json_equal(expected_json, actual_json))
diff --git a/tests/test_isatools_utils.py b/tests/test_isatools_utils.py
index 60a4b93..0bda52e 100644
--- a/tests/test_isatools_utils.py
+++ b/tests/test_isatools_utils.py
@@ -31,7 +31,7 @@ def test_detect_graph_process_pooling(self):
             print("Checking {}".format(assay.filename))
             pooling_list = utils.detect_graph_process_pooling(assay.graph)
             self.assertListEqual(sorted(pooling_list),
-                                 sorted(['#process/Extraction1', '#process/ADG_normalized_data.xlsx']))
+                                 sorted(['#process/Extraction1']))
 
     def test_detect_graph_process_pooling_batch_on_mtbls(self):
         for i in range(1, 1):