From a8710573e28275a50de541edf5bac0e8bfb2a0cd Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 6 Nov 2019 16:21:00 +0100 Subject: [PATCH 1/2] Add curation tools from hbp --- src/pybel_tools/chem.py | 78 ++++++++ src/pybel_tools/citation_coocurrence.py | 75 ++++++++ src/pybel_tools/curation/__init__.py | 3 + src/pybel_tools/curation/planning/__init__.py | 3 + .../curation/planning/check_novelties.py | 82 ++++++++ src/pybel_tools/curation/planning/go.py | 86 +++++++++ src/pybel_tools/curation/planning/pathways.py | 59 ++++++ src/pybel_tools/curation/planning/pubmed.py | 51 +++++ .../curation/planning/tag_has_pmc.py | 137 +++++++++++++ .../curation/recuration/__init__.py | 3 + .../find_duplicate_recuration_issues.py | 47 +++++ .../recuration/make_recuration_issues.py | 135 +++++++++++++ src/pybel_tools/curation/recuration/utils.py | 32 ++++ .../curation/reporting/__init__.py | 3 + .../curation/reporting/check_bad_names.py | 22 +++ .../reporting/summarize_effort_per_curator.py | 56 ++++++ src/pybel_tools/curation/utils.py | 83 ++++++++ src/pybel_tools/normalize.py | 180 ++++++++++++++++++ 18 files changed, 1135 insertions(+) create mode 100644 src/pybel_tools/chem.py create mode 100644 src/pybel_tools/citation_coocurrence.py create mode 100644 src/pybel_tools/curation/__init__.py create mode 100644 src/pybel_tools/curation/planning/__init__.py create mode 100644 src/pybel_tools/curation/planning/check_novelties.py create mode 100644 src/pybel_tools/curation/planning/go.py create mode 100644 src/pybel_tools/curation/planning/pathways.py create mode 100644 src/pybel_tools/curation/planning/pubmed.py create mode 100644 src/pybel_tools/curation/planning/tag_has_pmc.py create mode 100644 src/pybel_tools/curation/recuration/__init__.py create mode 100644 src/pybel_tools/curation/recuration/find_duplicate_recuration_issues.py create mode 100644 src/pybel_tools/curation/recuration/make_recuration_issues.py create mode 100644 src/pybel_tools/curation/recuration/utils.py create mode 100644 src/pybel_tools/curation/reporting/__init__.py create mode 100644 src/pybel_tools/curation/reporting/check_bad_names.py create mode 100644 src/pybel_tools/curation/reporting/summarize_effort_per_curator.py create mode 100644 src/pybel_tools/curation/utils.py create mode 100644 src/pybel_tools/normalize.py diff --git a/src/pybel_tools/chem.py b/src/pybel_tools/chem.py new file mode 100644 index 00000000..c4d95199 --- /dev/null +++ b/src/pybel_tools/chem.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- + +"""Chemistry tools for BEL.""" + +import itertools as itt +from typing import Iterable, Tuple + +from rdkit import DataStructs +from rdkit.Chem import MACCSkeys, MolFromInchi +from tqdm import tqdm + +from pybel import BELGraph +from pybel.constants import ANNOTATIONS, IDENTIFIER, NAMESPACE +from pybel.dsl import BaseEntity +from pybel.struct import invert_node_predicate, node_predicate, remove_filtered_nodes + +__all__ = [ + 'add_similarity_edges', + 'remove_non_inchi', +] + + +def iter_inchi_nodes(graph: BELGraph) -> Iterable[Tuple[tuple, dict, str]]: + """Iterate over node tuple, node data, and InChI string triples.""" + for node, data in graph.nodes(data=True): + if node_is_inchi(data): + yield node, data, data.get(IDENTIFIER) + + +@node_predicate +def node_is_inchi(node: BaseEntity): + return 'inchi' == node.get(NAMESPACE) and IDENTIFIER in node + + +def remove_non_inchi(graph: BELGraph): + """Remove all non-inchi nodes.""" + remove_filtered_nodes(graph, invert_node_predicate(node_is_inchi)) + + +def 
add_similarity_edges(graph: BELGraph, cutoff: float = 0.8) -> None: + """Enrich a BEL graph with edges between chemicals that have InChI. + + :param graph: A BEL graph + :param cutoff: The cutoff for similarity + """ + inchi_to_node_tuple = { + inchi: node_tuple + for node_tuple, _, inchi in iter_inchi_nodes(graph) + } + + mols = { + inchi: MolFromInchi(inchi) + for inchi in inchi_to_node_tuple + } + + fps = { + inchi: MACCSkeys.GenMACCSKeys(mol) + for inchi, mol in tqdm(mols.items(), desc='calculating MACCS keys') + if mol is not None + } + + n_combinations = (len(fps) * (len(fps) - 1) / 2) + _sim_iter = ( + (x, y, DataStructs.FingerprintSimilarity(fps[x], fps[y])) + # can also use FingerprintMols + for x, y in tqdm(itt.combinations(fps, 2), total=n_combinations, desc='calculating similarity') + ) + + for x, y, sim in _sim_iter: + if sim < cutoff: + continue + + source, target = inchi_to_node_tuple[x], inchi_to_node_tuple[y] + + key = graph.add_unqualified_edge(source, target, relation='similar') + graph[source][target][key][ANNOTATIONS] = { + 'similarity': sim + } diff --git a/src/pybel_tools/citation_coocurrence.py b/src/pybel_tools/citation_coocurrence.py new file mode 100644 index 00000000..7b159eb8 --- /dev/null +++ b/src/pybel_tools/citation_coocurrence.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- + +"""Build a network of citations connected by co-occurrence of entities.""" + +import itertools as itt +from collections import Counter, defaultdict +from typing import TextIO + +import click +import networkx as nx +from tqdm import tqdm + +from pybel import BELGraph, Manager +from pybel.cli import connection_option, graph_pickle_argument +from pybel.constants import CITATION, CITATION_REFERENCE, CITATION_TITLE, CITATION_TYPE, CITATION_TYPE_PUBMED +from pybel.manager.citation_utils import enrich_pubmed_citations + + +@click.command() +@connection_option +@graph_pickle_argument +@click.option('-o', '--output', type=click.File('w'), required=True) +@click.option('-t', '--threshold', type=int, default=1) +def main(connection: str, graph: BELGraph, output: TextIO, threshold): + """Build a citation network from the graph.""" + enrich_pubmed_citations(Manager(connection=connection), graph) + citation_network = make_citation_network(graph, threshold=threshold) + print('Source', 'Source Title', 'Target', 'Target Title', 'Shared', sep='\t', file=output) + for u, v, d in citation_network.edges(data=True): + print( + u, + citation_network.nodes[u]['title'], + v, + citation_network.nodes[v]['title'], + d['weight'], + sep='\t', + file=output, + ) + + +def make_citation_network(bel_graph: BELGraph, threshold: int = 0) -> nx.Graph: + """Make a citation network from the BEL graph based on which statements occur in multiple sourves.""" + dd = defaultdict(set) + names = {} + for u, v, k, d in bel_graph.edges(keys=True, data=True): + citation = d.get(CITATION) + if citation is None or citation[CITATION_TYPE] != CITATION_TYPE_PUBMED: + continue + reference = citation[CITATION_REFERENCE] + dd[reference].update((u, v)) + names[reference] = citation.get(CITATION_TITLE) + + all_nodes = set(itt.chain.from_iterable(dd.values())) + + iterator = itt.product(all_nodes, itt.combinations(dd.items(), r=2)) + iterator = tqdm(iterator, total=len(all_nodes) * (len(dd) ** 2)) + c = Counter( + (c1, c2) + for node, ((c1, c1_values), (c2, c2_values)) in iterator + if node in c1_values and node in c2_values + ) + + rv = nx.Graph() + for (c1, c2), weight in c.items(): + if weight >= threshold: + rv.add_edge(c1, c2, 
weight=weight) + + for reference, title in names.items(): + rv.nodes[reference]['title'] = title + + return rv + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/__init__.py b/src/pybel_tools/curation/__init__.py new file mode 100644 index 00000000..7fa123d9 --- /dev/null +++ b/src/pybel_tools/curation/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +"""Scripts for curation.""" diff --git a/src/pybel_tools/curation/planning/__init__.py b/src/pybel_tools/curation/planning/__init__.py new file mode 100644 index 00000000..fd74c504 --- /dev/null +++ b/src/pybel_tools/curation/planning/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +"""Scripts for planning curation.""" diff --git a/src/pybel_tools/curation/planning/check_novelties.py b/src/pybel_tools/curation/planning/check_novelties.py new file mode 100644 index 00000000..6aea4bff --- /dev/null +++ b/src/pybel_tools/curation/planning/check_novelties.py @@ -0,0 +1,82 @@ +# -*- coding: utf-8 -*- + +"""This script assesses the novelty of pending curation tasks. + +Currently, is limited to articles where PMC is available to ensure +good INDRA coverage. +""" + +import json +import logging +from typing import Optional + +import click +from easy_config.contrib.click import args_from_config +from gitlab.v4.objects import Issue, Project +from hbp_knowledge import get_graph +from pybel_git.gitlab import GitlabConfig + +from pybel import BELGraph +from pybel_tools.assess_completeness import CompletenessSummary, assess_completeness +from ..recuration.utils import CURATION_LABEL + +_prefix = '- PMID: [' + + +@click.command() +@args_from_config(GitlabConfig) +@click.option('-o', '--output', type=click.File('w')) +def main(project_id: int, url: str, token: str, output) -> None: + """Assess the completeness of HBP curation tasks with respect to CONIB.""" + logging.basicConfig(level=logging.INFO) + logging.getLogger('hbp').setLevel(logging.INFO) + + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + do_it(project, output) + + +def do_it(project: Project, output): + graph = get_graph() + + summaries = assess_project_completeness(project=project, graph=graph) + + if output is not None: + json.dump(list(summaries), output, indent=2) + else: + for summary in summaries: + click.echo(json.dumps(summary, indent=2)) + + +def assess_project_completeness(*, project: Project, graph: BELGraph): + """Summarize thee novelty of all issues in the project.""" + issues = project.issues.list(labels=[CURATION_LABEL]) + for issue in issues: + click.echo(f'Issue {issue.id}: {issue.title}') + s = assess_issue_completeness(issue=issue, graph=graph) + d = s.summary_dict() + yield d + + +def assess_issue_completeness(*, issue: Issue, graph: BELGraph) -> CompletenessSummary: + """Summarize the novelty of the PMID referenced by the issue.""" + pmid = _get_pmid(issue.description) + ids = ('pmid', pmid) + return assess_completeness(ids, graph) + + +def _get_pmid(description: str) -> Optional[str]: + for line in description.split('\n'): + line = line.strip() + if line.startswith(_prefix): + line: str = line[len(_prefix):] + line = line[:line.index(']')] + return line + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/planning/go.py b/src/pybel_tools/curation/planning/go.py new file mode 100644 index 00000000..7741e944 --- /dev/null +++ b/src/pybel_tools/curation/planning/go.py @@ -0,0 +1,86 @@ +# -*- coding: utf-8 -*- + 
+"""Curation tools for the Gene Ontology (GO). + +Run with `python -m hbp.curation.planning.go`. +""" + +from typing import List + +import click +import requests + +from ..utils import make_issues_from_pmids, min_year_option + +url = 'http://golr-aux.geneontology.io/solr/select' + +BASE_PARAMS = { + 'defType': ['edismax'], + 'qt': ['standard'], + 'indent': ['on'], + 'wt': ['csv'], + 'rows': ['100000'], + 'start': ['0'], 'fl': ['reference'], + 'facet': ['true'], + 'facet.mincount': ['1'], + 'facet.sort': ['count'], + 'json.nl': ['arrarr'], + 'facet.limit': ['25'], + 'hl': ['true'], + 'hl.simple.pre': [''], + 'hl.snippets': ['1000'], + 'csv.separator': ['\t'], + 'csv.header': ['false'], + 'csv.mv.separator': ['|'], + 'fq': ['document_category:"annotation"'], # add bioentity here too + 'facet.field': ['aspect', 'taxon_subset_closure_label', 'type', 'evidence_subset_closure_label', + 'regulates_closure_label', 'annotation_class_label', 'qualifier', + 'annotation_extension_class_closure_label', 'assigned_by', 'panther_family_label'], + 'q': ['*:*'], +} + + +def get_pmids_from_go_annotations_by_uniprot_id(uniprot_id: str) -> List[str]: + """Get the PubMed identifiers used in GO annotations for the given protein.""" + params = BASE_PARAMS.copy() + params['fq'].append(f'bioentity:"UniProtKB:{uniprot_id}"') + r = requests.get(url, params) + lines = ( + line.strip() + for line in r.text.splitlines() + ) + return list(sorted({ + line.split(':')[1] + for line in lines + if line and line.lower().startswith('pmid') + })) + + +@click.command() +@click.argument('uniprot_id') +@click.option('--namespace', type=click.Choice(['uniprot']), default='uniprot') +@min_year_option +@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository') +@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC') +@click.option('-l', '--label', multiple=True) +def main(uniprot_id: str, namespace: str, min_year: int, make_issues: bool, allow_closed: bool, label: List[str]): + """Get a list of documents for the given UniProt identifier. + + Example: Q13148. + """ + if namespace == 'uniprot': + pmids = get_pmids_from_go_annotations_by_uniprot_id(uniprot_id) + else: + raise ValueError(f'{namespace} is not yet supported') + + make_issues_from_pmids( + pmids, + min_year=min_year, + allow_closed=allow_closed, + make_issues=make_issues, + labels=label, + ) + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/planning/pathways.py b/src/pybel_tools/curation/planning/pathways.py new file mode 100644 index 00000000..d9db35e9 --- /dev/null +++ b/src/pybel_tools/curation/planning/pathways.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +"""Curation tools for the Gene Ontology (GO). + +Run with `python -m hbp.curation.planning.pathways`. 
+""" + +from typing import Iterable, Optional + +import click + +import bio2bel_kegg +import bio2bel_reactome +import bio2bel_wikipathways +from compath_utils import CompathManager +from pybel.cli import connection_option + + +def get_managers(connection: Optional[str] = None) -> Iterable[CompathManager]: + wikipathways_manager = bio2bel_wikipathways.Manager(connection=connection) + if not wikipathways_manager.is_populated(): + click.echo('WikiPathways is not populated') + else: + yield wikipathways_manager + + reactome_manager = bio2bel_reactome.Manager(connection=connection) + if not reactome_manager.is_populated(): + click.echo('Reactome is not populated') + else: + yield reactome_manager + + kegg_manager = bio2bel_kegg.Manager(connection=connection) + if not kegg_manager.is_populated(): + click.echo('KEGG is not populated') + else: + yield kegg_manager + + +@click.command() +@click.argument('name') +@click.option('--namespace', type=click.Choice('hgnc.symbol'), default='hgnc.symbol') +@connection_option +def main(name: str, namespace: str, connection: Optional[str]): + for manager in get_managers(connection): + if namespace == 'hgnc.symbol': + protein = manager.get_protein_by_hgnc_symbol(name) + else: + raise ValueError(f'{namespace} is not yet supported') + + if protein is None: + click.echo(f'No pathways in {manager.module_name}') + else: + for pathway in protein.pathways: + pathway_id = getattr(pathway, f'{manager.module_name}_id') + click.echo(f'{manager.module_name}:{pathway_id} ! {pathway}') + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/planning/pubmed.py b/src/pybel_tools/curation/planning/pubmed.py new file mode 100644 index 00000000..028cb30e --- /dev/null +++ b/src/pybel_tools/curation/planning/pubmed.py @@ -0,0 +1,51 @@ +# -*- coding: utf-8 -*- + +"""Curation tools for PubMed. + +Run with `python -m hbp.curation.planning.pubmed`. +""" + +import sys +from typing import List + +import click +from easy_config.contrib.click import args_from_config +from pybel_git.gitlab import GitlabConfig + +from ..utils import make_issues_from_pmids, min_year_option + + +@click.command() +@args_from_config(GitlabConfig) +@click.option('-f', '--file', default=sys.stdin, type=click.File()) +@min_year_option +@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository') +@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC') +@click.option('-l', '--label', multiple=True) +def main(project_id: int, url: str, token: str, file, min_year: int, make_issues: bool, allow_closed: bool, + label: List[str]): + """Get a list of documents by their PubMed identifiers.""" + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + + pmids = list(sorted({ + line.strip() + for line in file + })) + + make_issues_from_pmids( + project, + pmids, + min_year=min_year, + allow_closed=allow_closed, + make_issues=make_issues, + labels=label, + ) + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/planning/tag_has_pmc.py b/src/pybel_tools/curation/planning/tag_has_pmc.py new file mode 100644 index 00000000..84cf8623 --- /dev/null +++ b/src/pybel_tools/curation/planning/tag_has_pmc.py @@ -0,0 +1,137 @@ +# -*- coding: utf-8 -*- + +"""A one-off script for updating availability tags in GitLab. 
+ +This script gets all curation issues that are tagged as ``Availability: Missing``, looks up their +information in PubMed, and switches for ``Availability: PMC`` when appropriate. +""" + +from typing import Any, Mapping, Optional + +import click +from easy_config.contrib.click import args_from_config +from pybel.manager.citation_utils import get_pubmed_citation_response +from pybel_git.gitlab import GitlabConfig +from tqdm import tqdm + +from ..utils import get_issue_pdf, get_pmc_from_result, lstrip_list + +PMID_PREFIX = 'PMID:' +AVAILABILITY_PREFIX = 'Availability: ' +AVAILABILE_PDF = 'Availability: PDF' +AVAILABILE_PMC = 'Availability: PMC' +AVAILABILE_MISSING = 'Availability: Missing' + + +@click.command() +@args_from_config(GitlabConfig) +def main(project_id: int, url: str, token: str): + """Update the availability tags.""" + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + issues = [ + issue + for issue in project.issues.list(all=True) + if 'Curation' in issue.labels + ] + + missing_dict = {} + for issue in issues: + if get_issue_pdf(issue.description): + if AVAILABILE_PDF not in issue.labels: + issue.labels.append(AVAILABILE_PDF) + issue.save() + continue + + pmid = get_pmid_from_title(issue.title) + + if pmid is None: + pmid = get_pmid_from_description(issue.description) + + if pmid is None: + print(f'MISSING PMID: {issue.title}\n{issue.description}\n{"=" * 80}\n\n') + continue + + availability = get_availability(issue) + + if availability is None: + issue.labels.append(AVAILABILE_MISSING) + issue.save() + availability = 'Missing' + + if availability == 'Missing': + if get_issue_pdf(issue.description): + print(f'PDF available for (pmid:{pmid}) / {issue.title}') + issue.labels.remove(AVAILABILE_MISSING) + issue.labels.append(AVAILABILE_PDF) + issue.save() + else: + print(f'Availability missing for (pmid:{pmid}) / {issue.title}') + missing_dict[pmid] = issue + + print(f'{len(missing_dict)} PMIDs were missing availabilities') + + pmid_to_pmc = get_availabilities_from_eutils(missing_dict) + + print(f'Found {len(pmid_to_pmc)} in PMC.') + it = tqdm(missing_dict.items(), desc='Updating issues') + for pmid, issue in it: + pmc = pmid_to_pmc.get(pmid) + if pmc is None: + continue + it.write(f'Updating availability on {issue.title}') + issue.labels.remove(AVAILABILE_MISSING) + issue.labels.append(AVAILABILE_PMC) + issue.discussions.create({ + 'body': f'Available from PubMed Central at https://identifiers.org/pmc:{pmc}' + }) + issue.save() + + +def get_availability(issue): + for label in issue.labels: + if label.startswith(AVAILABILITY_PREFIX): + return label[len(AVAILABILITY_PREFIX):] + + +def get_pmid_from_title(title) -> Optional[str]: + try: + index = title.index('pmid:') + except ValueError: + return None + else: + if 0 <= index: + return title[index + len('pmid:'):title.index(')', index)] + + +def get_pmid_from_description(description): + for line in description.splitlines(): + line = lstrip_list(line).replace('[', '') + if line.startswith(PMID_PREFIX): + if ']' in line: + return line[len(PMID_PREFIX) + 1:line.index(']')] + else: + return line[len(PMID_PREFIX):].strip() + + +def get_availabilities_from_eutils(pmids: Mapping[str, Any]) -> Mapping[str, str]: + rv = {} + results = get_pubmed_citation_response(pmids)['result'] + for pmid, issue in pmids.items(): + try: + result = results[pmid] + except KeyError: + print(f'Error on {issue.title}') + continue + pmc = get_pmc_from_result(result) + 
if pmc is not None: + rv[pmid] = pmc + return rv + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/recuration/__init__.py b/src/pybel_tools/curation/recuration/__init__.py new file mode 100644 index 00000000..8ca37835 --- /dev/null +++ b/src/pybel_tools/curation/recuration/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +"""Scripts for planning re-curation.""" diff --git a/src/pybel_tools/curation/recuration/find_duplicate_recuration_issues.py b/src/pybel_tools/curation/recuration/find_duplicate_recuration_issues.py new file mode 100644 index 00000000..c5ecac11 --- /dev/null +++ b/src/pybel_tools/curation/recuration/find_duplicate_recuration_issues.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- + +"""A script that finds duplicate re-curation issues.""" + +from collections import defaultdict +from typing import List, Mapping + +import click +from easy_config.contrib.click import args_from_config +from gitlab.v4.objects import Issue, Project +from pybel_git.gitlab import GitlabConfig + +from .utils import RECURATION_ISSUE_NAME_PREFIX, RECURATION_LABEL + + +def get_recuration_issues(project) -> Mapping[str, List[Issue]]: + issues = defaultdict(list) + for recuration_issue in project.issues.list(all=True, labels=[RECURATION_LABEL]): + if recuration_issue.title.lower().startswith(RECURATION_ISSUE_NAME_PREFIX): + parts = recuration_issue.title[len(RECURATION_ISSUE_NAME_PREFIX):].split() + key = parts[0].strip().lower() + issues[key].append(recuration_issue) + return issues + + +@click.command() +@args_from_config(GitlabConfig) +def main(project_id: int, url: str, token: str): + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + _do_it(project) + + +def _do_it(project: Project) -> None: + for title, issues in get_recuration_issues(project).items(): + if len(issues) > 1: + print(title, issues) + for issue in issues: + print(f'https://gitlab.scai.fraunhofer.de/charles.hoyt/hbp/issues/{issue.get_id()}') + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/recuration/make_recuration_issues.py b/src/pybel_tools/curation/recuration/make_recuration_issues.py new file mode 100644 index 00000000..0cd6422d --- /dev/null +++ b/src/pybel_tools/curation/recuration/make_recuration_issues.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +"""A one-off script for making issues in GitLab for BEL graphs without re-curation. + +Run with: ``python -m hbp.curation.recuration.make_recuration_issues``. + +If you just want to test what would happen, run with ``--dry``. 
+""" + +import os + +import click +import hbp_knowledge +from easy_config.contrib.click import args_from_config +from pybel import BELGraph +from pybel.constants import ANNOTATIONS, RELATION, UNQUALIFIED_EDGES +from pybel_git.gitlab import GitlabConfig + +from .utils import ( + CURATION_LABEL, RECURATION_ISSUE_MISSING_DESCRIPTION, + RECURATION_ISSUE_NAME_PREFIX, RECURATION_LABEL, get_curation_issues, get_recuration_issues, +) + + +@click.command() +@args_from_config(GitlabConfig) +@click.option('--dry', is_flag=True) +@click.option('-v', '--verbose', is_flag=True) +def main(project_id: int, url: str, token: str, dry: bool, verbose: bool): + """Reorganize re-curation issues.""" + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + + click.echo('Getting re-curation issues') + recuration_issues = get_recuration_issues(project) + click.echo(f'Retrieved {len(recuration_issues)} re-curation issues labeled "{RECURATION_LABEL}"') + + click.echo('Getting curation issues') + original_issues = get_curation_issues(project) + click.echo(f'Retrieved {len(original_issues)} curation issues labeled "{CURATION_LABEL}"') + + overlap = set(recuration_issues).intersection(original_issues) + click.echo(f'Overlap: {len(overlap)}') + + for i in recuration_issues: + if i not in original_issues: + print(i) + for i in original_issues: + if i not in recuration_issues: + print(i) + + click.echo('Getting graphs') + graphs = hbp_knowledge.get_graphs() + click.echo(f'Got {len(graphs)} graphs') + + new_count, closed_count, done_count, pending_count = 0, 0, 0, 0 + for path, graph in graphs.items(): + recurated = graph_has_recuration(graph) + basename = os.path.basename(path)[:-len('.bel')] + recuration_issue = recuration_issues.get(basename) + + original_issue = original_issues.get(basename) + ss = '\n' + '\n- '.join(graph.summary_str().split('\n')) + if original_issue: + description = f'Original issue: #{original_issue.get_id()}\n{ss}' + else: + description = RECURATION_ISSUE_MISSING_DESCRIPTION + f'\n{ss}' + + if recuration_issue is None and recurated and verbose: + click.echo(f'✅ {basename} already re-curated') + + elif recuration_issue is None and not recurated: + title = make_issue_title(basename) + click.echo(f'❌ {basename} creating issue: {title}') + new_count += 1 + if not dry: + recuration_issue = project.issues.create({ + 'title': title, + 'description': description, + }) + recuration_issue.labels.append(RECURATION_LABEL) + recuration_issue.save() + + elif recuration_issue and recurated: + if recuration_issue.state == 'opened': + click.echo(f'🚪 {basename} closing issue {recuration_issue.title}') + closed_count += 1 + if not dry: + recuration_issue.state_event = 'close' + recuration_issue.save() + else: + if verbose: + click.echo(f'✅ {basename} already done') + done_count += 1 + + elif recuration_issue and not recurated: + click.echo(f'❌ {basename} not yet re-curated (#{recuration_issue.get_id()})') + pending_count += 1 + + if recuration_issue is not None and graph.description != description: + click.echo(f'{basename} updating description') + if not dry: + recuration_issue.description = description + recuration_issue.save() + + click.echo(f''' +Summary +=========== + New {new_count} + Closed {closed_count} +Pending {pending_count} + Done {done_count} +''') + + +def make_issue_title(name: str) -> str: + """Format the graph's name into an issue name.""" + return f'{RECURATION_ISSUE_NAME_PREFIX}{name}' + + +def 
graph_has_recuration(graph: BELGraph) -> bool: + """Check that all edges have re-curation.""" + return all( + ANNOTATIONS in data and 'Confidence' in data[ANNOTATIONS] + for _, _, data in graph.edges(data=True) + if data[RELATION] not in UNQUALIFIED_EDGES + ) + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/recuration/utils.py b/src/pybel_tools/curation/recuration/utils.py new file mode 100644 index 00000000..c0d41d20 --- /dev/null +++ b/src/pybel_tools/curation/recuration/utils.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- + +"""Utility functions for re-curation tools.""" + +from typing import Mapping + +from gitlab.v4.objects import Issue + +CURATION_LABEL = 'Curation' +RECURATION_LABEL = 'Re-curation' +RECURATION_ISSUE_NAME_PREFIX = 're-curate '.lower() +CURATION_ISSUE_NAME_PREFIX = 'curate '.lower() +RECURATION_ISSUE_MISSING_DESCRIPTION = 'Missing original issue.' + + +def get_recuration_issues(project) -> Mapping[str, Issue]: + return _get_issue_prefixed(project, RECURATION_LABEL, RECURATION_ISSUE_NAME_PREFIX) + + +def get_curation_issues(project, all_issues: bool = True) -> Mapping[str, Issue]: + return _get_issue_prefixed(project, CURATION_LABEL, CURATION_ISSUE_NAME_PREFIX, all_issues=all_issues) + + +def _get_issue_prefixed(project, label, prefix, all_issues: bool = True): + issues = {} + for issue in project.issues.list(all=all_issues, labels=[label]): + if issue.title.lower().startswith(prefix): + title = issue.title[len(prefix):] + parts = title.split() + key = parts[0].strip().strip(':').lower() + issues[key] = issue + return issues diff --git a/src/pybel_tools/curation/reporting/__init__.py b/src/pybel_tools/curation/reporting/__init__.py new file mode 100644 index 00000000..a6069c92 --- /dev/null +++ b/src/pybel_tools/curation/reporting/__init__.py @@ -0,0 +1,3 @@ +# -*- coding: utf-8 -*- + +"""Scripts for reporting on curation.""" diff --git a/src/pybel_tools/curation/reporting/check_bad_names.py b/src/pybel_tools/curation/reporting/check_bad_names.py new file mode 100644 index 00000000..0eda7022 --- /dev/null +++ b/src/pybel_tools/curation/reporting/check_bad_names.py @@ -0,0 +1,22 @@ +# -*- coding: utf-8 -*- + +"""This script checks which of the BEL graphs have unhelpful names.""" + +import re + +import click +from hbp_knowledge import get_graphs + +BAD_NAME = re.compile(r'.*\d{4}$') + + +@click.command() +def main(): + """Check which graphs' names end with a year.""" + for name, graph in get_graphs().items(): + if BAD_NAME.match(graph.name): + click.echo(f'{graph.name} {graph.path}') + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/reporting/summarize_effort_per_curator.py b/src/pybel_tools/curation/reporting/summarize_effort_per_curator.py new file mode 100644 index 00000000..c43a8d70 --- /dev/null +++ b/src/pybel_tools/curation/reporting/summarize_effort_per_curator.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +"""Summarize how much effort each curator has put.""" + +from collections import Counter, defaultdict +from typing import Mapping + +import click + +from pybel import BELGraph + + +def summarize_effort_per_graph(graphs: Mapping[str, BELGraph]): + """Summarize the effort per curator.""" + r = defaultdict(list) + for name, graph in graphs.items(): + if ',' in graph.authors: + first_author = graph.authors.split(',')[0].strip() + elif ' and ' in graph.authors: + i = graph.authors.find(' and ') + first_author = graph.authors[:i] + else: # sole author + first_author = graph.authors + + 
r[first_author].append(graph) + + max_author_width = max(map(len, r)) + + click.echo('= Author Graphs =') + graph_counter = Counter({ + author: len(graphs) + for author, graphs in r.items() + }) + for author, graph_count in graph_counter.most_common(): + click.echo(f'{author:{max_author_width}} {graph_count:>}') + + click.echo('\n= Author Edges =') + edge_counter = Counter({ + author: sum(graph.number_of_edges() for graph in graphs) + for author, graphs in r.items() + }) + for author, edge_count in edge_counter.most_common(): + click.echo(f'{author:{max_author_width}} {edge_count:>}') + + +@click.command() +def main(): + """Summarize the effort per curator.""" + import hbp_knowledge + + graphs: Mapping[str, BELGraph] = hbp_knowledge.get_graphs() + summarize_effort_per_graph(graphs) + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/utils.py b/src/pybel_tools/curation/utils.py new file mode 100644 index 00000000..d3e5fb3c --- /dev/null +++ b/src/pybel_tools/curation/utils.py @@ -0,0 +1,83 @@ +# -*- coding: utf-8 -*- + +"""Curation utilities.""" + +from typing import Iterable, List, Optional + +import click +from gitlab.v4.objects import Project + +from pybel.manager.citation_utils import get_pubmed_citation_response + +min_year_option = click.option( + '--min-year', type=int, default=2001, show_default=True, + help='Minimum publication year. Before 2001 is dubious.', +) + + +def make_issues_from_pmids( + project: Project, + pmids: Iterable[str], + min_year: Optional[int] = None, + allow_closed: bool = False, + make_issues: bool = False, + labels: Optional[List[str]] = None, +) -> None: + """Make issues on the GitLab project for the given articles.""" + results = get_pubmed_citation_response(pmids)['result'] + existing_issue_titles = { + issue.title + for issue in project.issues.list(all=True) + } + + for pmid in pmids: + result = results[pmid] + first_author_surname = result['sortfirstauthor'].split()[0].lower() + pubyear = int(result['sortpubdate'].split('/')[0]) + if min_year is not None and pubyear < min_year: + continue + + pmc = get_pmc_from_result(result) + if pmc is not None: + issue_title = f'''{first_author_surname}{pubyear} (pmid:{pmid}, pmc:{pmc}) "{result['title']}"''' + elif allow_closed: + issue_title = f'''{first_author_surname}{pubyear} (pmid:{pmid}) "{result['title']}"''' + else: + continue + + if make_issues and issue_title not in existing_issue_titles: + project_issue = project.issues.create({ + 'title': f'Curate {issue_title}', + # 'description': 'Something useful here.' 
+ }) + if pmc is not None: + project_issue.labels.append('Availability: PMC') + project_issue.labels = ['Curation'] + if labels is not None: + project_issue.labels.extend(labels) + project_issue.save() + + +def get_pmc_from_result(result) -> Optional[str]: + for article_id in result['articleids']: + if article_id['idtype'] == 'pmc': + return article_id['value'] + + +PDF_PREFIX = 'PDF:' +DOWNLOAD_PREFIX = 'Download:' + + +def get_issue_pdf(description: str) -> str: + """Get the link to the PDF for a given issue.""" + for line in description.splitlines(): + line = lstrip_list(line) + if line.startswith(PDF_PREFIX) and len(line) > len(PDF_PREFIX): + return line[len(PDF_PREFIX):] + if line.startswith(DOWNLOAD_PREFIX) and len(line) > len(DOWNLOAD_PREFIX): + return line[len(DOWNLOAD_PREFIX):] + + +def lstrip_list(s: str) -> str: + """Left strip a string of its bullet point.""" + return s.strip().lstrip('-').lstrip() diff --git a/src/pybel_tools/normalize.py b/src/pybel_tools/normalize.py new file mode 100644 index 00000000..69c794ad --- /dev/null +++ b/src/pybel_tools/normalize.py @@ -0,0 +1,180 @@ +# -*- coding: utf-8 -*- + +"""Utilities to help with normalizing entities in BEL graphs.""" + +import logging +from collections import Counter +from itertools import chain +from typing import Iterable, List, Mapping, Optional, TextIO, Tuple + +from pybel import BELGraph +from pybel.constants import IDENTIFIER, NAME, NAMESPACE +from pybel.dsl import ( + BaseEntity, CentralDogma, FusionBase, GeneModification, ListAbundance, + ProteinModification, Reaction, +) + +from pybel_tools.utils import group_as_lists + +__all__ = [ + 'normalize', + 'get_unnormalized', + 'summarize_unnormalized', +] + +logger = logging.getLogger(__name__) + + +def normalize(graph: BELGraph, use_tqdm: bool = True) -> None: + """Normalize all of the entities in the graph.""" + logger.info('normalizing HGNC') + import bio2bel_hgnc + hgnc_manager = bio2bel_hgnc.Manager() + hgnc_manager.normalize_genes(graph, use_tqdm=use_tqdm) + + # logger.info('normalizing HGNC Gene Families') + # gfam_manager = bio2bel_hgnc.FamilyManager() + # gfam_manager.normalize_families(graph) + + logger.info('normalizing MeSH') + import bio2bel_mesh + mesh_manager = bio2bel_mesh.Manager() + if mesh_manager.is_populated(): + mesh_manager.normalize_terms(graph, use_tqdm=use_tqdm) + else: + logger.warning('MeSH has not been populated') + + logger.info('normalizing FamPlex') + import bio2bel_famplex + famplex_manager = bio2bel_famplex.Manager() + famplex_manager.normalize_terms(graph, use_tqdm=use_tqdm) + + logger.info('normalizing GO') + import bio2bel_go + go_manager = bio2bel_go.Manager() + go_manager.normalize_terms(graph, use_tqdm=use_tqdm) + + logger.info('normalizing CONSO') + import conso.manager + conso_manager = conso.manager.Manager() + conso_manager.normalize_terms(graph, use_tqdm=use_tqdm) + + logger.info('normalizing ChEBI') + import bio2bel_chebi + chebi_manager = bio2bel_chebi.Manager() + if chebi_manager.is_populated(): + chebi_manager.normalize_chemicals(graph, use_tqdm=use_tqdm) + else: + logger.warning('ChEBI has not been populated') + + logger.info('normalizing MGI') + import bio2bel_mgi + mgi_manager = bio2bel_mgi.Manager() + if mgi_manager.is_populated(): + mgi_manager.normalize_mouse_genes(graph, use_tqdm=use_tqdm) + else: + logger.warning('MGI has not been populated') + + logger.info('normalizing RGD') + import bio2bel_rgd + rgd_manager = bio2bel_rgd.Manager() + if rgd_manager.is_populated(): + rgd_manager.normalize_rat_genes(graph, 
use_tqdm=use_tqdm) + else: + logger.warning('RGD has not been populated') + + # logger.info('normalizing InterPro') + + # logger.info('normalizing PFAM') + + logger.info('normalizing Entrez Gene') + import bio2bel_entrez + entrez_manager = bio2bel_entrez.Manager() + entrez_manager.normalize_genes(graph, use_tqdm=use_tqdm) + + logger.info('normalizing UniProt') + import bio2bel_uniprot + uniprot_manager = bio2bel_uniprot.Manager() + uniprot_manager.normalize_terms(graph, use_tqdm=use_tqdm) + + logger.info('normalizing DrugBank') + import bio2bel_drugbank + drugbank_manager = bio2bel_drugbank.Manager() + if drugbank_manager.is_populated(): + drugbank_manager.normalize_drugs(graph, use_tqdm=use_tqdm) + else: + logger.warning('DrugBank has not been populated') + + logger.info('normalizing miRBase') + import bio2bel_mirbase + mirbase_manager = bio2bel_mirbase.Manager() + if mirbase_manager.is_populated(): + mirbase_manager.normalize_terms(graph, use_tqdm=use_tqdm) + else: + logger.warning('miRBase has not been populated') + + # TODO deal with OBO-based bio2bel + # logger.info('normalizing CL') + # logger.info('normalizing HP') + + +def summarize_unnormalized(graph: BELGraph, file: Optional[TextIO] = None) -> None: + for namespace, names in get_unnormalized(graph).items(): + name_counter = Counter(names) + print( + namespace, + len(names), + *[x for x, _ in name_counter.most_common(3)], + file=file, + ) + + +def get_unnormalized(graph: BELGraph) -> Mapping[str, List[str]]: + """Get a mapping of namespaces to their unnormalized names.""" + return group_as_lists(iter_unnormalize_entity_namespaces(graph)) + + +def iter_unnormalize_entity_namespaces(graph: BELGraph) -> Iterable[Tuple[str, str]]: + """Get the namespaces that haven't been normalized.""" + for node in graph: + yield from _iter_unnormalized_node(node) + + +def _iter_unnormalized_node(node: BaseEntity) -> Iterable[Tuple[str, str]]: + namespace, name, identifier = node.get(NAMESPACE), node.get(NAME), node.get(IDENTIFIER) + + if not namespace: + pass + + elif name and identifier: + pass + + elif name and not identifier: + yield namespace, name + + elif not name and identifier: + yield namespace, identifier + + elif isinstance(node, FusionBase): + yield from _iter_unnormalized_node(node.partner_5p) + yield from _iter_unnormalized_node(node.partner_3p) + + elif isinstance(node, CentralDogma) and node.variants: + for variant in node.variants: + if isinstance(variant, (GeneModification, ProteinModification)): + namespace = variant.entity.get(NAMESPACE) + name = variant.entity.get(NAME) + identifier = variant.entity.get(IDENTIFIER) + if namespace and not identifier: + yield namespace, name + + elif isinstance(node, ListAbundance): + for member in node.members: + yield from _iter_unnormalized_node(member) + + elif isinstance(node, Reaction): + for member in chain(node.reactants, node.products): + yield from _iter_unnormalized_node(member) + + else: + logger.warning('Unhandled node: %r', node) From 9fc07987ec8b7e2f56bd71e993eaeeef98316712 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Thu, 14 May 2020 12:29:39 +0200 Subject: [PATCH 2/2] Remove deprecated --- src/pybel_tools/chem.py | 78 ---------- src/pybel_tools/citation_coocurrence.py | 12 +- src/pybel_tools/normalize.py | 180 ------------------------ 3 files changed, 6 insertions(+), 264 deletions(-) delete mode 100644 src/pybel_tools/chem.py delete mode 100644 src/pybel_tools/normalize.py diff --git a/src/pybel_tools/chem.py b/src/pybel_tools/chem.py deleted file mode 100644 index 
c4d95199..00000000 --- a/src/pybel_tools/chem.py +++ /dev/null @@ -1,78 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Chemistry tools for BEL.""" - -import itertools as itt -from typing import Iterable, Tuple - -from rdkit import DataStructs -from rdkit.Chem import MACCSkeys, MolFromInchi -from tqdm import tqdm - -from pybel import BELGraph -from pybel.constants import ANNOTATIONS, IDENTIFIER, NAMESPACE -from pybel.dsl import BaseEntity -from pybel.struct import invert_node_predicate, node_predicate, remove_filtered_nodes - -__all__ = [ - 'add_similarity_edges', - 'remove_non_inchi', -] - - -def iter_inchi_nodes(graph: BELGraph) -> Iterable[Tuple[tuple, dict, str]]: - """Iterate over node tuple, node data, and InChI string triples.""" - for node, data in graph.nodes(data=True): - if node_is_inchi(data): - yield node, data, data.get(IDENTIFIER) - - -@node_predicate -def node_is_inchi(node: BaseEntity): - return 'inchi' == node.get(NAMESPACE) and IDENTIFIER in node - - -def remove_non_inchi(graph: BELGraph): - """Remove all non-inchi nodes.""" - remove_filtered_nodes(graph, invert_node_predicate(node_is_inchi)) - - -def add_similarity_edges(graph: BELGraph, cutoff: float = 0.8) -> None: - """Enrich a BEL graph with edges between chemicals that have InChI. - - :param graph: A BEL graph - :param cutoff: The cutoff for similarity - """ - inchi_to_node_tuple = { - inchi: node_tuple - for node_tuple, _, inchi in iter_inchi_nodes(graph) - } - - mols = { - inchi: MolFromInchi(inchi) - for inchi in inchi_to_node_tuple - } - - fps = { - inchi: MACCSkeys.GenMACCSKeys(mol) - for inchi, mol in tqdm(mols.items(), desc='calculating MACCS keys') - if mol is not None - } - - n_combinations = (len(fps) * (len(fps) - 1) / 2) - _sim_iter = ( - (x, y, DataStructs.FingerprintSimilarity(fps[x], fps[y])) - # can also use FingerprintMols - for x, y in tqdm(itt.combinations(fps, 2), total=n_combinations, desc='calculating similarity') - ) - - for x, y, sim in _sim_iter: - if sim < cutoff: - continue - - source, target = inchi_to_node_tuple[x], inchi_to_node_tuple[y] - - key = graph.add_unqualified_edge(source, target, relation='similar') - graph[source][target][key][ANNOTATIONS] = { - 'similarity': sim - } diff --git a/src/pybel_tools/citation_coocurrence.py b/src/pybel_tools/citation_coocurrence.py index 7b159eb8..a538ad14 100644 --- a/src/pybel_tools/citation_coocurrence.py +++ b/src/pybel_tools/citation_coocurrence.py @@ -2,17 +2,17 @@ """Build a network of citations connected by co-occurrence of entities.""" -import itertools as itt from collections import Counter, defaultdict -from typing import TextIO import click +import itertools as itt import networkx as nx from tqdm import tqdm +from typing import TextIO from pybel import BELGraph, Manager from pybel.cli import connection_option, graph_pickle_argument -from pybel.constants import CITATION, CITATION_REFERENCE, CITATION_TITLE, CITATION_TYPE, CITATION_TYPE_PUBMED +from pybel.constants import CITATION, CITATION_DB, CITATION_DB_NAME, CITATION_IDENTIFIER, CITATION_TYPE_PUBMED from pybel.manager.citation_utils import enrich_pubmed_citations @@ -44,11 +44,11 @@ def make_citation_network(bel_graph: BELGraph, threshold: int = 0) -> nx.Graph: names = {} for u, v, k, d in bel_graph.edges(keys=True, data=True): citation = d.get(CITATION) - if citation is None or citation[CITATION_TYPE] != CITATION_TYPE_PUBMED: + if citation is None or citation[CITATION_DB] != CITATION_TYPE_PUBMED: continue - reference = citation[CITATION_REFERENCE] + reference = citation[CITATION_IDENTIFIER] 
dd[reference].update((u, v)) - names[reference] = citation.get(CITATION_TITLE) + names[reference] = citation.get(CITATION_DB_NAME) all_nodes = set(itt.chain.from_iterable(dd.values())) diff --git a/src/pybel_tools/normalize.py b/src/pybel_tools/normalize.py deleted file mode 100644 index 69c794ad..00000000 --- a/src/pybel_tools/normalize.py +++ /dev/null @@ -1,180 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Utilities to help with normalizing entities in BEL graphs.""" - -import logging -from collections import Counter -from itertools import chain -from typing import Iterable, List, Mapping, Optional, TextIO, Tuple - -from pybel import BELGraph -from pybel.constants import IDENTIFIER, NAME, NAMESPACE -from pybel.dsl import ( - BaseEntity, CentralDogma, FusionBase, GeneModification, ListAbundance, - ProteinModification, Reaction, -) - -from pybel_tools.utils import group_as_lists - -__all__ = [ - 'normalize', - 'get_unnormalized', - 'summarize_unnormalized', -] - -logger = logging.getLogger(__name__) - - -def normalize(graph: BELGraph, use_tqdm: bool = True) -> None: - """Normalize all of the entities in the graph.""" - logger.info('normalizing HGNC') - import bio2bel_hgnc - hgnc_manager = bio2bel_hgnc.Manager() - hgnc_manager.normalize_genes(graph, use_tqdm=use_tqdm) - - # logger.info('normalizing HGNC Gene Families') - # gfam_manager = bio2bel_hgnc.FamilyManager() - # gfam_manager.normalize_families(graph) - - logger.info('normalizing MeSH') - import bio2bel_mesh - mesh_manager = bio2bel_mesh.Manager() - if mesh_manager.is_populated(): - mesh_manager.normalize_terms(graph, use_tqdm=use_tqdm) - else: - logger.warning('MeSH has not been populated') - - logger.info('normalizing FamPlex') - import bio2bel_famplex - famplex_manager = bio2bel_famplex.Manager() - famplex_manager.normalize_terms(graph, use_tqdm=use_tqdm) - - logger.info('normalizing GO') - import bio2bel_go - go_manager = bio2bel_go.Manager() - go_manager.normalize_terms(graph, use_tqdm=use_tqdm) - - logger.info('normalizing CONSO') - import conso.manager - conso_manager = conso.manager.Manager() - conso_manager.normalize_terms(graph, use_tqdm=use_tqdm) - - logger.info('normalizing ChEBI') - import bio2bel_chebi - chebi_manager = bio2bel_chebi.Manager() - if chebi_manager.is_populated(): - chebi_manager.normalize_chemicals(graph, use_tqdm=use_tqdm) - else: - logger.warning('ChEBI has not been populated') - - logger.info('normalizing MGI') - import bio2bel_mgi - mgi_manager = bio2bel_mgi.Manager() - if mgi_manager.is_populated(): - mgi_manager.normalize_mouse_genes(graph, use_tqdm=use_tqdm) - else: - logger.warning('MGI has not been populated') - - logger.info('normalizing RGD') - import bio2bel_rgd - rgd_manager = bio2bel_rgd.Manager() - if rgd_manager.is_populated(): - rgd_manager.normalize_rat_genes(graph, use_tqdm=use_tqdm) - else: - logger.warning('RGD has not been populated') - - # logger.info('normalizing InterPro') - - # logger.info('normalizing PFAM') - - logger.info('normalizing Entrez Gene') - import bio2bel_entrez - entrez_manager = bio2bel_entrez.Manager() - entrez_manager.normalize_genes(graph, use_tqdm=use_tqdm) - - logger.info('normalizing UniProt') - import bio2bel_uniprot - uniprot_manager = bio2bel_uniprot.Manager() - uniprot_manager.normalize_terms(graph, use_tqdm=use_tqdm) - - logger.info('normalizing DrugBank') - import bio2bel_drugbank - drugbank_manager = bio2bel_drugbank.Manager() - if drugbank_manager.is_populated(): - drugbank_manager.normalize_drugs(graph, use_tqdm=use_tqdm) - else: - logger.warning('DrugBank 
has not been populated') - - logger.info('normalizing miRBase') - import bio2bel_mirbase - mirbase_manager = bio2bel_mirbase.Manager() - if mirbase_manager.is_populated(): - mirbase_manager.normalize_terms(graph, use_tqdm=use_tqdm) - else: - logger.warning('miRBase has not been populated') - - # TODO deal with OBO-based bio2bel - # logger.info('normalizing CL') - # logger.info('normalizing HP') - - -def summarize_unnormalized(graph: BELGraph, file: Optional[TextIO] = None) -> None: - for namespace, names in get_unnormalized(graph).items(): - name_counter = Counter(names) - print( - namespace, - len(names), - *[x for x, _ in name_counter.most_common(3)], - file=file, - ) - - -def get_unnormalized(graph: BELGraph) -> Mapping[str, List[str]]: - """Get a mapping of namespaces to their unnormalized names.""" - return group_as_lists(iter_unnormalize_entity_namespaces(graph)) - - -def iter_unnormalize_entity_namespaces(graph: BELGraph) -> Iterable[Tuple[str, str]]: - """Get the namespaces that haven't been normalized.""" - for node in graph: - yield from _iter_unnormalized_node(node) - - -def _iter_unnormalized_node(node: BaseEntity) -> Iterable[Tuple[str, str]]: - namespace, name, identifier = node.get(NAMESPACE), node.get(NAME), node.get(IDENTIFIER) - - if not namespace: - pass - - elif name and identifier: - pass - - elif name and not identifier: - yield namespace, name - - elif not name and identifier: - yield namespace, identifier - - elif isinstance(node, FusionBase): - yield from _iter_unnormalized_node(node.partner_5p) - yield from _iter_unnormalized_node(node.partner_3p) - - elif isinstance(node, CentralDogma) and node.variants: - for variant in node.variants: - if isinstance(variant, (GeneModification, ProteinModification)): - namespace = variant.entity.get(NAMESPACE) - name = variant.entity.get(NAME) - identifier = variant.entity.get(IDENTIFIER) - if namespace and not identifier: - yield namespace, name - - elif isinstance(node, ListAbundance): - for member in node.members: - yield from _iter_unnormalized_node(member) - - elif isinstance(node, Reaction): - for member in chain(node.reactants, node.products): - yield from _iter_unnormalized_node(member) - - else: - logger.warning('Unhandled node: %r', node)
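
For orientation, below is a minimal usage sketch of the citation co-occurrence builder that survives the second commit. It assumes pybel and networkx are installed and that a pickled BELGraph is available; the input path `example.bel.pickle`, the threshold value, and the printing loop are illustrative and not part of either commit, and `pybel.from_pickle` is assumed to be the pickle loader exposed by the pybel version this patch targets.

# Hypothetical usage sketch (not part of the patch): build a citation co-occurrence
# network from a pickled BEL graph and print the shared-entity counts per PMID pair.
from pybel import from_pickle  # assumed loader for pickled BELGraph files
from pybel_tools.citation_coocurrence import make_citation_network

graph = from_pickle('example.bel.pickle')  # hypothetical input file
# Keep only PubMed pairs that share at least two entities (threshold value is illustrative).
citation_graph = make_citation_network(graph, threshold=2)
for source, target, data in citation_graph.edges(data=True):
    print(source, target, data['weight'], sep='\t')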