From 823206c30e6e665494c53d0ea62c1eacd47b8b01 Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Thu, 11 Jul 2024 15:36:30 -0300 Subject: [PATCH 01/10] Adjusted tests for omim changes. --- src/test/specific_tests.py | 2 +- src/test/unit_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/specific_tests.py b/src/test/specific_tests.py index 07b75707..7d5bdc76 100644 --- a/src/test/specific_tests.py +++ b/src/test/specific_tests.py @@ -1168,7 +1168,7 @@ def test_human_dej_has_omim_full_url_cross_reference(): """Test Human DEJ has OMIM Full URL Cross Reference""" query = """MATCH (g:Gene)--(dej:DiseaseEntityJoin)--(cr:CrossReference) - WHERE cr.crossRefCompleteUrl = 'https://www.omim.org/entry/605242' + WHERE cr.crossRefCompleteUrl = 'https://www.omim.org/605242' RETURN count(cr) AS counter""" with Neo4jHelper.run_single_query(query) as result: for record in result: diff --git a/src/test/unit_tests.py b/src/test/unit_tests.py index 40dc0038..c9fb5551 100644 --- a/src/test/unit_tests.py +++ b/src/test/unit_tests.py @@ -73,7 +73,7 @@ def test_url_lookup(self): 'result': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C5604'}, {'local_id': 'badregexdoesnotmatch', 'global_id': 'MESH:badregexdoesnotmatch', 'result': 'https://www.ncbi.nlm.nih.gov/mesh/badregexdoesnotmatch'}, - {'local_id': 'Cdiff', 'global_id': 'MIM:1111', 'result': 'https://www.omim.org/entry/1111'}] + {'local_id': 'Cdiff', 'global_id': 'MIM:1111', 'result': 'https://www.omim.org/1111'}] for item in lookups: url = self.etlh.get_complete_url_ont(item['local_id'], item['global_id']) From cd428b550c8d62f462f635eaf3017fc29c139857 Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Thu, 11 Jul 2024 15:41:41 -0300 Subject: [PATCH 02/10] Additional OMIM and MIM changes. --- src/etl/disease_etl.py | 2 +- src/etl/helpers/etl_helper.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/etl/disease_etl.py b/src/etl/disease_etl.py index 38633f73..6f08c086 100644 --- a/src/etl/disease_etl.py +++ b/src/etl/disease_etl.py @@ -315,7 +315,7 @@ def process_pages(self, dp, xrefs, pages): for page in pages: if (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'DOID': display_name = 'RGD' - elif (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and prefix == 'OMIM': + elif (self.data_provider == 'RGD' or self.data_provider == 'HUMAN') and (prefix == 'OMIM' or prefix == 'MIM'): display_name = 'OMIM' else: display_name = cross_ref_id.split(":")[0] diff --git a/src/etl/helpers/etl_helper.py b/src/etl/helpers/etl_helper.py index 2e3bf442..34ed5511 100644 --- a/src/etl/helpers/etl_helper.py +++ b/src/etl/helpers/etl_helper.py @@ -154,7 +154,7 @@ def get_xref_dict(local_id, prefix, cross_ref_type, page, def get_complete_url_ont(self, local_id, global_id, key=None): """Get Complete 'ont'.""" page = None - if 'OMIM:PS' in global_id: + if 'OMIM:PS' or 'MIM:PS' in global_id: page = 'ont' if not key: # split not done before hand From 3470971178ee73fa8f769e0e32a43ce3bb1bbac6 Mon Sep 17 00:00:00 2001 From: Chris Tabone Date: Thu, 11 Jul 2024 20:12:36 -0300 Subject: [PATCH 03/10] Update specific_tests.py --- src/test/specific_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/specific_tests.py b/src/test/specific_tests.py index 7d5bdc76..4cf27ff0 100644 --- a/src/test/specific_tests.py +++ b/src/test/specific_tests.py @@ -1168,7 +1168,7 @@ def test_human_dej_has_omim_full_url_cross_reference(): """Test Human DEJ has OMIM Full URL Cross Reference""" query = """MATCH (g:Gene)--(dej:DiseaseEntityJoin)--(cr:CrossReference) - WHERE cr.crossRefCompleteUrl = 'https://www.omim.org/605242' + WHERE cr.crossRefCompleteUrl = 'https://www.omim.org/MIM:605242' RETURN count(cr) AS counter""" with Neo4jHelper.run_single_query(query) as result: for record in result: From 15859dd03222d6db08813f84aba9744b2eb21c10 Mon Sep 17 00:00:00 2001 From: Chris Tabone Date: Thu, 11 Jul 2024 20:13:08 -0300 Subject: [PATCH 04/10] Update unit_tests.py --- src/test/unit_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/unit_tests.py b/src/test/unit_tests.py index c9fb5551..4f000660 100644 --- a/src/test/unit_tests.py +++ b/src/test/unit_tests.py @@ -73,7 +73,7 @@ def test_url_lookup(self): 'result': 'https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&code=C5604'}, {'local_id': 'badregexdoesnotmatch', 'global_id': 'MESH:badregexdoesnotmatch', 'result': 'https://www.ncbi.nlm.nih.gov/mesh/badregexdoesnotmatch'}, - {'local_id': 'Cdiff', 'global_id': 'MIM:1111', 'result': 'https://www.omim.org/1111'}] + {'local_id': 'Cdiff', 'global_id': 'MIM:1111', 'result': 'https://www.omim.org/MIM:1111'}] for item in lookups: url = self.etlh.get_complete_url_ont(item['local_id'], item['global_id']) From dcec44165d81bc27cde721258013619dfd975d12 Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Fri, 12 Jul 2024 10:23:11 -0300 Subject: [PATCH 05/10] Debugging for later. --- src/etl/helpers/resource_descriptor_helper_2.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/etl/helpers/resource_descriptor_helper_2.py b/src/etl/helpers/resource_descriptor_helper_2.py index 7c9844e4..7cdfb56c 100644 --- a/src/etl/helpers/resource_descriptor_helper_2.py +++ b/src/etl/helpers/resource_descriptor_helper_2.py @@ -333,4 +333,11 @@ def return_url_from_identifier(self, identifier, page=None): self.bad_regex[key] = 1 else: self.bad_regex[key] += 1 + # if page == "ontology_provided_cross_reference": + # self.logger.info('Processing ontology_provided_cross_reference') + # self.logger.info('DB Prefix: %s', db_prefix) + # self.logger.info('Identifier: %s', identifier) + # self.logger.info('Identifier Processed: %s', identifier_post_processed) + # self.logger.info('GID Pattern: %s', gid_pattern) + # self.logger.info('Regex Match: %s', regex_output) return self.return_url_from_key_value(key, identifier_stripped, alt_page=page) From 1ad202185667bd23e416d85fa317951be64b8bf5 Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Fri, 12 Jul 2024 12:17:40 -0300 Subject: [PATCH 06/10] Debugging for strange errors. --- .../helpers/resource_descriptor_helper_2.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/etl/helpers/resource_descriptor_helper_2.py b/src/etl/helpers/resource_descriptor_helper_2.py index 7cdfb56c..877a448b 100644 --- a/src/etl/helpers/resource_descriptor_helper_2.py +++ b/src/etl/helpers/resource_descriptor_helper_2.py @@ -325,19 +325,19 @@ def return_url_from_identifier(self, identifier, page=None): regex_output = re.match(gid_pattern, identifier_post_processed, re.IGNORECASE) if regex_output is None: if key not in self.bad_regex: - self.logger.warning('Cross Reference identifier did not match Resource Descriptor YAML file gid pattern.') - self.logger.warning('Database prefix: %s', db_prefix) - self.logger.warning('Identifier: %s', identifier_post_processed) - self.logger.warning('gid pattern: %s', gid_pattern) - self.logger.warning('page: %s', page) + self.logger.info('Cross Reference identifier did not match Resource Descriptor YAML file gid pattern.') + self.logger.info('Database prefix: %s', db_prefix) + self.logger.info('Identifier: %s', identifier_post_processed) + self.logger.info('gid pattern: %s', gid_pattern) + self.logger.info('page: %s', page) self.bad_regex[key] = 1 else: self.bad_regex[key] += 1 - # if page == "ontology_provided_cross_reference": - # self.logger.info('Processing ontology_provided_cross_reference') - # self.logger.info('DB Prefix: %s', db_prefix) - # self.logger.info('Identifier: %s', identifier) - # self.logger.info('Identifier Processed: %s', identifier_post_processed) - # self.logger.info('GID Pattern: %s', gid_pattern) - # self.logger.info('Regex Match: %s', regex_output) + if page == "ontology_provided_cross_reference": + self.logger.info('Processing ontology_provided_cross_reference') + self.logger.info('DB Prefix: %s', db_prefix) + self.logger.info('Identifier: %s', identifier) + self.logger.info('Identifier Processed: %s', identifier_post_processed) + self.logger.info('GID Pattern: %s', gid_pattern) + self.logger.info('Regex Match: %s', regex_output) return self.return_url_from_key_value(key, identifier_stripped, alt_page=page) From 97459a9624a43c86473c01aaa809693bf4f5471e Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Mon, 15 Jul 2024 13:29:15 -0300 Subject: [PATCH 07/10] Minor update to accommodate edge cases. --- .../helpers/resource_descriptor_helper_2.py | 25 +++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/src/etl/helpers/resource_descriptor_helper_2.py b/src/etl/helpers/resource_descriptor_helper_2.py index 877a448b..9e8ce5aa 100644 --- a/src/etl/helpers/resource_descriptor_helper_2.py +++ b/src/etl/helpers/resource_descriptor_helper_2.py @@ -302,7 +302,13 @@ def return_url(self, identifier, page): def return_url_from_identifier(self, identifier, page=None): """Return URL for an identifier.""" db_prefix, identifier_stripped, separator = self.split_identifier(identifier) - + + # Normalize the identifier + # Special case for EFO. + if db_prefix and db_prefix.upper() == "EFO": + if not identifier_stripped.startswith("EFO_"): + identifier_stripped = "EFO_" + identifier_stripped + key = self.get_key(db_prefix, identifier) if not key: return None @@ -325,19 +331,12 @@ def return_url_from_identifier(self, identifier, page=None): regex_output = re.match(gid_pattern, identifier_post_processed, re.IGNORECASE) if regex_output is None: if key not in self.bad_regex: - self.logger.info('Cross Reference identifier did not match Resource Descriptor YAML file gid pattern.') - self.logger.info('Database prefix: %s', db_prefix) - self.logger.info('Identifier: %s', identifier_post_processed) - self.logger.info('gid pattern: %s', gid_pattern) - self.logger.info('page: %s', page) + self.logger.warning('Cross Reference identifier did not match Resource Descriptor YAML file gid pattern.') + self.logger.warning('Database prefix: %s', db_prefix) + self.logger.warning('Identifier: %s', identifier_post_processed) + self.logger.warning('gid pattern: %s', gid_pattern) + self.logger.warning('page: %s', page) self.bad_regex[key] = 1 else: self.bad_regex[key] += 1 - if page == "ontology_provided_cross_reference": - self.logger.info('Processing ontology_provided_cross_reference') - self.logger.info('DB Prefix: %s', db_prefix) - self.logger.info('Identifier: %s', identifier) - self.logger.info('Identifier Processed: %s', identifier_post_processed) - self.logger.info('GID Pattern: %s', gid_pattern) - self.logger.info('Regex Match: %s', regex_output) return self.return_url_from_key_value(key, identifier_stripped, alt_page=page) From 887accf06a32a794656304a1c9aa40c2d17ca0c0 Mon Sep 17 00:00:00 2001 From: Christopher Tabone Date: Mon, 15 Jul 2024 14:09:50 -0300 Subject: [PATCH 08/10] Triggering the PR again. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c5a2bcb5..62261e3d 100644 --- a/README.md +++ b/README.md @@ -74,3 +74,4 @@ _Reminder_: authentification needs to be renewed every time you get an error lik ``` Error response from daemon: pull access denied for 100225593120.dkr.ecr.us-east-1.amazonaws.com/agr_neo4j_env, repository does not exist or may require 'docker login': denied: Your authorization token has expired. Reauthenticate and try again. ``` + \ No newline at end of file From 5deb11e5aadf18777343183635f9c4747f23a4c9 Mon Sep 17 00:00:00 2001 From: Chris Tabone Date: Mon, 15 Jul 2024 17:56:25 -0300 Subject: [PATCH 09/10] Update etl_helper.py --- src/etl/helpers/etl_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/etl/helpers/etl_helper.py b/src/etl/helpers/etl_helper.py index 34ed5511..3a055c96 100644 --- a/src/etl/helpers/etl_helper.py +++ b/src/etl/helpers/etl_helper.py @@ -154,7 +154,7 @@ def get_xref_dict(local_id, prefix, cross_ref_type, page, def get_complete_url_ont(self, local_id, global_id, key=None): """Get Complete 'ont'.""" page = None - if 'OMIM:PS' or 'MIM:PS' in global_id: + if 'OMIM:PS' in global_id or 'MIM:PS' in global_id: page = 'ont' if not key: # split not done before hand From b0ea9e1e70c8c13133c6e753cbf5f8815e89c40f Mon Sep 17 00:00:00 2001 From: Chris Tabone Date: Mon, 15 Jul 2024 19:59:25 -0300 Subject: [PATCH 10/10] Update README.md Triggering GoCD. --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 62261e3d..87d5ba6a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,3 @@ -[![Build Status](https://travis-ci.org/alliance-genome/agr_loader.svg?branch=develop)](https://travis-ci.org/alliance-genome/agr_loader) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/5259a0e847c04c72a4a9c4f34fabfed5)](https://www.codacy.com/project/christabone/agr_loader/dashboard?utm_source=github.com&utm_medium=referral&utm_content=alliance-genome/agr_loader&utm_campaign=Badge_Grade_Dashboard) # Alliance of Genome Resources Loader @@ -74,4 +73,4 @@ _Reminder_: authentification needs to be renewed every time you get an error lik ``` Error response from daemon: pull access denied for 100225593120.dkr.ecr.us-east-1.amazonaws.com/agr_neo4j_env, repository does not exist or may require 'docker login': denied: Your authorization token has expired. Reauthenticate and try again. ``` - \ No newline at end of file +