def make_citation_network(bel_graph: BELGraph, threshold: int = 0) -> nx.Graph:
    """Make a citation network based on which entities occur in multiple sources.

    Every PubMed citation found on an edge of the BEL graph is associated with the
    BEL nodes of the edges it supports. Two citations are connected when they share
    at least one BEL node; the edge weight is the number of shared BEL nodes.

    :param bel_graph: The BEL graph whose edges carry citation data
    :param threshold: The minimum number of shared BEL nodes needed to keep an edge
    :return: An undirected graph whose nodes are PubMed identifiers (with a ``title``
     attribute) and whose edges carry a ``weight`` attribute
    """
    nodes_by_reference = defaultdict(set)
    titles = {}
    for u, v, _key, data in bel_graph.edges(keys=True, data=True):
        citation = data.get(CITATION)
        if citation is None or citation[CITATION_DB] != CITATION_TYPE_PUBMED:
            continue
        reference = citation[CITATION_IDENTIFIER]
        nodes_by_reference[reference].update((u, v))
        titles[reference] = citation.get(CITATION_DB_NAME)

    # There are n * (n - 1) / 2 unordered pairs of citations, not n ** 2
    n_references = len(nodes_by_reference)
    pairs = tqdm(
        itt.combinations(nodes_by_reference.items(), r=2),
        total=n_references * (n_references - 1) // 2,
    )

    rv = nx.Graph()
    for (c1, c1_nodes), (c2, c2_nodes) in pairs:
        # The size of the intersection equals the number of BEL nodes found in both
        # citations, without testing every BEL node against every pair
        weight = len(c1_nodes & c2_nodes)
        # Pairs that share nothing never become edges, regardless of the threshold
        if weight and weight >= threshold:
            rv.add_edge(c1, c2, weight=weight)

    # Only annotate citations that made it into the network; looking up a citation
    # without any edge in ``rv.nodes`` would raise a KeyError
    for reference in rv:
        rv.nodes[reference]['title'] = titles[reference]

    return rv
def assess_project_completeness(*, project: Project, graph: BELGraph):
    """Summarize the novelty of all curation issues in the project.

    :param project: The GitLab project whose issues are assessed
    :param graph: The BEL graph against which each issue is compared
    :return: Yields one summary dictionary per issue labeled for curation
    """
    # Without ``all=True`` python-gitlab only returns the first page of results,
    # so later issues would silently be left out of the summary
    issues = project.issues.list(all=True, labels=[CURATION_LABEL])
    for issue in issues:
        click.echo(f'Issue {issue.id}: {issue.title}')
        summary = assess_issue_completeness(issue=issue, graph=graph)
        yield summary.summary_dict()
def get_pmids_from_go_annotations_by_uniprot_id(uniprot_id: str) -> List[str]:
    """Get the PubMed identifiers used in GO annotations for the given protein.

    :param uniprot_id: A UniProt accession, e.g. ``Q13148``
    :return: A sorted list of unique PubMed identifiers
    """
    # ``dict.copy()`` is shallow: appending to ``params['fq']`` would modify the list
    # inside the module-level BASE_PARAMS and leak this protein's filter into every
    # later call. Build a fresh filter list instead.
    params = dict(BASE_PARAMS)
    params['fq'] = [*BASE_PARAMS['fq'], f'bioentity:"UniProtKB:{uniprot_id}"']

    r = requests.get(url, params)
    lines = (
        line.strip()
        for line in r.text.splitlines()
    )
    return sorted({
        line.split(':')[1]
        for line in lines
        if line and line.lower().startswith('pmid')
    })
+@click.option('--namespace', type=click.Choice(['uniprot']), default='uniprot') +@min_year_option +@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository') +@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC') +@click.option('-l', '--label', multiple=True) +def main(uniprot_id: str, namespace: str, min_year: int, make_issues: bool, allow_closed: bool, label: List[str]): + """Get a list of documents for the given UniProt identifier. + + Example: Q13148. + """ + if namespace == 'uniprot': + pmids = get_pmids_from_go_annotations_by_uniprot_id(uniprot_id) + else: + raise ValueError(f'{namespace} is not yet supported') + + make_issues_from_pmids( + pmids, + min_year=min_year, + allow_closed=allow_closed, + make_issues=make_issues, + labels=label, + ) + + +if __name__ == '__main__': + main() diff --git a/src/pybel_tools/curation/planning/pathways.py b/src/pybel_tools/curation/planning/pathways.py new file mode 100644 index 00000000..d9db35e9 --- /dev/null +++ b/src/pybel_tools/curation/planning/pathways.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- + +"""Curation tools for the Gene Ontology (GO). + +Run with `python -m hbp.curation.planning.pathways`. 
@click.command()
@click.argument('name')
# click.Choice expects a sequence of choices; a bare string is treated as a
# sequence of single characters
@click.option('--namespace', type=click.Choice(['hgnc.symbol']), default='hgnc.symbol')
@connection_option
def main(name: str, namespace: str, connection: Optional[str]):
    """List the pathways that contain the given protein in each populated database."""
    for manager in get_managers(connection):
        if namespace == 'hgnc.symbol':
            protein = manager.get_protein_by_hgnc_symbol(name)
        else:
            raise ValueError(f'{namespace} is not yet supported')

        if protein is None:
            click.echo(f'No pathways in {manager.module_name}')
        else:
            for pathway in protein.pathways:
                # Each pathway model stores its identifier under ``<module name>_id``
                pathway_id = getattr(pathway, f'{manager.module_name}_id')
                click.echo(f'{manager.module_name}:{pathway_id} ! {pathway}')
@click.command()
@args_from_config(GitlabConfig)
@click.option('-f', '--file', default=sys.stdin, type=click.File())
@min_year_option
@click.option('--make-issues', is_flag=True, help='Create issues on GitLab HBP repository')
@click.option('--allow-closed', is_flag=True, help='Allow publications that are not on PMC')
@click.option('-l', '--label', multiple=True)
def main(project_id: int, url: str, token: str, file, min_year: int, make_issues: bool, allow_closed: bool,
         label: List[str]):
    """Get a list of documents by their PubMed identifiers."""
    gitlab_config = GitlabConfig.load(  # noqa: S106
        project_id=project_id,
        url=url,
        token=token,
    )
    project = gitlab_config.get_project()

    # One PubMed identifier per line. Blank lines (e.g., a trailing newline) are
    # dropped so an empty string is never looked up as if it were an identifier.
    pmids = sorted({
        line.strip()
        for line in file
        if line.strip()
    })

    make_issues_from_pmids(
        project,
        pmids,
        min_year=min_year,
        allow_closed=allow_closed,
        make_issues=make_issues,
        labels=label,
    )
+""" + +from typing import Any, Mapping, Optional + +import click +from easy_config.contrib.click import args_from_config +from pybel.manager.citation_utils import get_pubmed_citation_response +from pybel_git.gitlab import GitlabConfig +from tqdm import tqdm + +from ..utils import get_issue_pdf, get_pmc_from_result, lstrip_list + +PMID_PREFIX = 'PMID:' +AVAILABILITY_PREFIX = 'Availability: ' +AVAILABILE_PDF = 'Availability: PDF' +AVAILABILE_PMC = 'Availability: PMC' +AVAILABILE_MISSING = 'Availability: Missing' + + +@click.command() +@args_from_config(GitlabConfig) +def main(project_id: int, url: str, token: str): + """Update the availability tags.""" + gitlab_config = GitlabConfig.load( # noqa: S106 + project_id=project_id, + url=url, + token=token, + ) + project = gitlab_config.get_project() + issues = [ + issue + for issue in project.issues.list(all=True) + if 'Curation' in issue.labels + ] + + missing_dict = {} + for issue in issues: + if get_issue_pdf(issue.description): + if AVAILABILE_PDF not in issue.labels: + issue.labels.append(AVAILABILE_PDF) + issue.save() + continue + + pmid = get_pmid_from_title(issue.title) + + if pmid is None: + pmid = get_pmid_from_description(issue.description) + + if pmid is None: + print(f'MISSING PMID: {issue.title}\n{issue.description}\n{"=" * 80}\n\n') + continue + + availability = get_availability(issue) + + if availability is None: + issue.labels.append(AVAILABILE_MISSING) + issue.save() + availability = 'Missing' + + if availability == 'Missing': + if get_issue_pdf(issue.description): + print(f'PDF available for (pmid:{pmid}) / {issue.title}') + issue.labels.remove(AVAILABILE_MISSING) + issue.labels.append(AVAILABILE_PDF) + issue.save() + else: + print(f'Availability missing for (pmid:{pmid}) / {issue.title}') + missing_dict[pmid] = issue + + print(f'{len(missing_dict)} PMIDs were missing availabilities') + + pmid_to_pmc = get_availabilities_from_eutils(missing_dict) + + print(f'Found {len(pmid_to_pmc)} in PMC.') + it = 
def get_pmid_from_title(title) -> Optional[str]:
    """Get the PubMed identifier that follows ``pmid:`` in an issue title.

    Only the run of digits directly after the marker is returned, so titles such as
    ``smith2010 (pmid:12345, pmc:PMC678) "..."`` give ``12345`` rather than
    everything up to the closing parenthesis.

    :param title: The title of the issue
    :return: The PubMed identifier, or None if the title does not contain one
    """
    marker = 'pmid:'
    start = title.find(marker)
    if start < 0:
        return None

    rest = title[start + len(marker):].lstrip()
    end = 0
    while end < len(rest) and rest[end].isdigit():
        end += 1
    # An empty match means the marker was not followed by an identifier
    return rest[:end] or None
def _do_it(project: Project) -> None:
    """Print each re-curation key that has more than one issue, followed by links to those issues."""
    issues_by_key = get_recuration_issues(project)
    for key, duplicates in issues_by_key.items():
        if len(duplicates) <= 1:
            continue  # a single issue per key is the expected situation
        print(key, duplicates)
        for issue in duplicates:
            print(f'https://gitlab.scai.fraunhofer.de/charles.hoyt/hbp/issues/{issue.get_id()}')
@click.command()
@args_from_config(GitlabConfig)
@click.option('--dry', is_flag=True)
@click.option('-v', '--verbose', is_flag=True)
def main(project_id: int, url: str, token: str, dry: bool, verbose: bool):
    """Reorganize re-curation issues.

    Creates a re-curation issue for each graph that lacks re-curation, closes open
    issues whose graph has been re-curated, and keeps the issue descriptions in sync.
    With --dry, only report what would be done.
    """
    gitlab_config = GitlabConfig.load(  # noqa: S106
        project_id=project_id,
        url=url,
        token=token,
    )
    project = gitlab_config.get_project()

    click.echo('Getting re-curation issues')
    recuration_issues = get_recuration_issues(project)
    click.echo(f'Retrieved {len(recuration_issues)} re-curation issues labeled "{RECURATION_LABEL}"')

    click.echo('Getting curation issues')
    original_issues = get_curation_issues(project)
    click.echo(f'Retrieved {len(original_issues)} curation issues labeled "{CURATION_LABEL}"')

    overlap = set(recuration_issues).intersection(original_issues)
    click.echo(f'Overlap: {len(overlap)}')

    # Report the keys that only appear on one side
    for i in recuration_issues:
        if i not in original_issues:
            print(i)
    for i in original_issues:
        if i not in recuration_issues:
            print(i)

    click.echo('Getting graphs')
    graphs = hbp_knowledge.get_graphs()
    click.echo(f'Got {len(graphs)} graphs')

    new_count, closed_count, done_count, pending_count = 0, 0, 0, 0
    for path, graph in graphs.items():
        recurated = graph_has_recuration(graph)
        # Issues are keyed by the file name without its ``.bel`` extension
        basename = os.path.basename(path)[:-len('.bel')]
        recuration_issue = recuration_issues.get(basename)

        original_issue = original_issues.get(basename)
        ss = '\n' + '\n- '.join(graph.summary_str().split('\n'))
        if original_issue:
            description = f'Original issue: #{original_issue.get_id()}\n{ss}'
        else:
            description = RECURATION_ISSUE_MISSING_DESCRIPTION + f'\n{ss}'

        if recuration_issue is None and recurated and verbose:
            click.echo(f'✅ {basename} already re-curated')

        elif recuration_issue is None and not recurated:
            title = make_issue_title(basename)
            click.echo(f'❌ {basename} creating issue: {title}')
            new_count += 1
            if not dry:
                recuration_issue = project.issues.create({
                    'title': title,
                    'description': description,
                })
                recuration_issue.labels.append(RECURATION_LABEL)
                recuration_issue.save()

        elif recuration_issue and recurated:
            if recuration_issue.state == 'opened':
                click.echo(f'🚪 {basename} closing issue {recuration_issue.title}')
                closed_count += 1
                if not dry:
                    recuration_issue.state_event = 'close'
                    recuration_issue.save()
            else:
                if verbose:
                    click.echo(f'✅ {basename} already done')
                done_count += 1

        elif recuration_issue and not recurated:
            click.echo(f'❌ {basename} not yet re-curated (#{recuration_issue.get_id()})')
            pending_count += 1

        # Compare against the text currently stored on the issue. Comparing against
        # the graph's own description would rewrite every issue on every run.
        if recuration_issue is not None and recuration_issue.description != description:
            click.echo(f'{basename} updating description')
            if not dry:
                recuration_issue.description = description
                recuration_issue.save()

    click.echo(f'''
Summary
===========
    New {new_count}
 Closed {closed_count}
Pending {pending_count}
   Done {done_count}
''')
def _get_issue_prefixed(project, label, prefix, all_issues: bool = True):
    """Index the project's issues that have the given label and title prefix.

    The key of each issue is the first word of its title after the prefix, without
    surrounding colons and in lowercase. If several issues share a key, the last
    one listed wins.

    :param project: The GitLab project whose issues are listed
    :param label: Only issues with this label are requested
    :param prefix: The lowercase prefix that an issue's title must start with
    :param all_issues: Passed as ``all`` to the issue listing
    :return: A dictionary from key to issue
    """
    issues = {}
    for issue in project.issues.list(all=all_issues, labels=[label]):
        if not issue.title.lower().startswith(prefix):
            continue
        parts = issue.title[len(prefix):].split()
        if not parts:
            # Nothing follows the prefix, so there is no key to index this issue by
            continue
        key = parts[0].strip(':').lower()
        issues[key] = issue
    return issues
def summarize_effort_per_graph(graphs: Mapping[str, BELGraph]):
    """Summarize the effort per curator.

    Graphs are grouped by the first author listed in their ``authors`` string. Two
    tables are echoed: the number of graphs per author and the number of edges per
    author, both from most to least.

    :param graphs: A mapping whose values are the BEL graphs to summarize
    """
    graphs_by_author = defaultdict(list)
    for graph in graphs.values():
        authors = graph.authors
        if ',' in authors:
            first_author = authors.split(',')[0].strip()
        elif ' and ' in authors:
            first_author = authors[:authors.find(' and ')]
        else:  # sole author
            first_author = authors

        graphs_by_author[first_author].append(graph)

    # ``default=0`` keeps the report working (headers only) when there are no graphs
    max_author_width = max(map(len, graphs_by_author), default=0)

    click.echo('= Author Graphs =')
    graph_counter = Counter({
        author: len(author_graphs)
        for author, author_graphs in graphs_by_author.items()
    })
    for author, graph_count in graph_counter.most_common():
        click.echo(f'{author:{max_author_width}} {graph_count:>}')

    click.echo('\n= Author Edges =')
    edge_counter = Counter({
        author: sum(graph.number_of_edges() for graph in author_graphs)
        for author, author_graphs in graphs_by_author.items()
    })
    for author, edge_count in edge_counter.most_common():
        click.echo(f'{author:{max_author_width}} {edge_count:>}')
def make_issues_from_pmids(
    project: Project,
    pmids: Iterable[str],
    min_year: Optional[int] = None,
    allow_closed: bool = False,
    make_issues: bool = False,
    labels: Optional[List[str]] = None,
) -> None:
    """Make issues on the GitLab project for the given articles.

    :param project: The GitLab project on which issues are created
    :param pmids: The PubMed identifiers of the articles
    :param min_year: If given, skip articles published before this year
    :param allow_closed: If true, also consider articles without a PMC identifier
    :param make_issues: If false, no issues are created
    :param labels: Additional labels to put on each created issue
    """
    # The identifiers are used twice (lookup and loop), so a one-shot iterable
    # must be materialized first
    pmids = list(pmids)
    results = get_pubmed_citation_response(pmids)['result']
    existing_issue_titles = {
        issue.title
        for issue in project.issues.list(all=True)
    }

    for pmid in pmids:
        if pmid not in results:
            click.echo(f'No PubMed result for pmid:{pmid}')
            continue

        result = results[pmid]
        first_author_surname = result['sortfirstauthor'].split()[0].lower()
        pubyear = int(result['sortpubdate'].split('/')[0])
        if min_year is not None and pubyear < min_year:
            continue

        pmc = get_pmc_from_result(result)
        if pmc is not None:
            issue_title = f'''{first_author_surname}{pubyear} (pmid:{pmid}, pmc:{pmc}) "{result['title']}"'''
        elif allow_closed:
            issue_title = f'''{first_author_surname}{pubyear} (pmid:{pmid}) "{result['title']}"'''
        else:
            continue

        if not make_issues:
            continue

        # Issues are stored with the "Curate " prefix, so the prefixed title is what
        # has to be compared; the bare title is also checked for older issues
        full_title = f'Curate {issue_title}'
        if full_title in existing_issue_titles or issue_title in existing_issue_titles:
            continue

        project_issue = project.issues.create({
            'title': full_title,
            # 'description': 'Something useful here.'
        })

        # Build the complete label list before assigning it, so the availability
        # label is not overwritten by the curation label
        issue_labels = ['Curation']
        if pmc is not None:
            issue_labels.append('Availability: PMC')
        if labels is not None:
            issue_labels.extend(labels)
        project_issue.labels = issue_labels
        project_issue.save()

        # Guard against the same article appearing twice in this run
        existing_issue_titles.add(full_title)