Commit
Merge branch 'main' into halo-updates
cmungall authored Aug 2, 2023
2 parents 95fe932 + fcc882b commit ea2fa05
Showing 3 changed files with 197 additions and 58 deletions.
50 changes: 40 additions & 10 deletions src/ontogpt/cli.py
@@ -303,6 +303,7 @@ def extract(
write_extraction(results, output, output_format, ke)


# TODO: combine this command with pubmed_annotate - they are converging
@main.command()
@template_option
@model_option
@@ -369,16 +370,26 @@ def iteratively_generate_extract(
@recurse_option
@output_option_wb
@output_format_options
@click.option(
"--get-pmc/--no-get-pmc",
default=False,
help="Attempt to parse PubMed Central full text(s) instead of abstract(s) alone.",
)
@click.argument("pmid")
def pubmed_extract(pmid, template, output, output_format, **kwargs):
def pubmed_extract(pmid, template, output, output_format, get_pmc, **kwargs):
"""Extract knowledge from a single PubMed ID."""
logging.info(f"Creating for {template}")
pmc = PubmedClient()
text = pmc.text(pmid)
ke = SPIRESEngine(template, **kwargs)
logging.debug(f"Input text: {text}")
results = ke.extract_from_text(text)
write_extraction(results, output, output_format)
if get_pmc:
logging.info(f"Will try to retrieve PubMed Central text for {pmid}.")
textlist = pmc.text(pmid, pubmedcentral=True)
else:
textlist = [pmc.text(pmid)] # for a single PMID, text() returns one string, so wrap it
for text in textlist:
ke = SPIRESEngine(template, **kwargs)
logging.debug(f"Input text: {text}")
results = ke.extract_from_text(text)
write_extraction(results, output, output_format)


@main.command()
@@ -387,14 +398,33 @@ def pubmed_extract(pmid, template, output, output_format, **kwargs):
@recurse_option
@output_option_wb
@output_format_options
@click.option(
"--limit",
default=20,
help="Total number of citation records to return.",
)
@click.option(
"--get-pmc/--no-get-pmc",
default=False,
help="Attempt to parse PubMed Central full text(s) instead of abstract(s) alone.",
)
@click.argument("search")
def pubmed_annotate(search, template, output, output_format, **kwargs):
"""Retrieve a collection of PubMed IDs for a search term; annotate them using a template."""
def pubmed_annotate(search, template, output, output_format, limit, get_pmc, **kwargs):
"""Retrieve a collection of PubMed IDs for a search term; annotate them using a template.
Example:
ontogpt pubmed-annotate -t phenotype "Takotsubo Cardiomyopathy: A Brief Review"
--get-pmc --model gpt-3.5-turbo-16k --limit 3
"""
logging.info(f"Creating for {template}")
pubmed_annotate_limit = 20 # TODO: make this a CLI argument
pubmed_annotate_limit = limit
pmc = PubmedClient()
pmids = pmc.get_pmids(search)
textlist = pmc.text(pmids[: pubmed_annotate_limit + 1])
if get_pmc:
logging.info("Will try to retrieve PubMed Central texts.")
textlist = pmc.text(pmids[:pubmed_annotate_limit], pubmedcentral=True)
else:
textlist = pmc.text(pmids[:pubmed_annotate_limit])
for text in textlist:
ke = SPIRESEngine(template, **kwargs)
logging.debug(f"Input text: {text}")
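For reference, a hypothetical invocation of the updated pubmed-extract command (the PMID and template are illustrative, not part of this diff):

    ontogpt pubmed-extract -t phenotype --get-pmc 12345678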
197 changes: 149 additions & 48 deletions src/ontogpt/clients/pubmed_client.py
@@ -66,50 +66,10 @@ def _score_text(text: str, keywords: List[str]) -> int:
return score


def parse_pmxml(xml: str, raw: bool, autoformat: bool) -> List[str]:
"""Extract structured text from PubMed XML.
:param xml: One or more xml entries, as string
:param raw: if True, do not parse the xml beyond separating documents
:param autoformat: if True include title and abstract concatenated
:return: a list of strings, one per entry
"""
docs = []

# Preprocess the string to ensure it's valid xml
if not raw:
logging.info("Preprocessing all xml entries...")
header = "\n".join(xml.split("\n", 3)[0:3])
pmas_opener = "<PubmedArticleSet>"
pmas_closer = "</PubmedArticleSet>"
for remove_string in [header, pmas_opener, pmas_closer]:
xml = xml.replace(remove_string, "\n")
xml = pmas_opener + xml + pmas_closer

soup = BeautifulSoup(xml, "xml")

logging.info("Parsing all xml entries...")
for pa in soup.find_all(["PubmedArticle", "PubmedBookArticle"]):
if autoformat and not raw:
ti = ""
if pa.find("ArticleTitle"):
ti = pa.find("ArticleTitle").text
ab = ""
if pa.find("Abstract"): # Document may not have abstract
ab = pa.find("Abstract").text
kw = ""
if pa.find("KeywordList"): # Document may not have MeSH terms or keywords
kw = [tag.text for tag in pa.find_all("Keyword")]
# else:
# kw = ""
txt = f"Title: {ti}\nAbstract: {ab}\nKeywords: {'; '.join(kw)}"
elif raw:
txt = str(pa)
else:
txt = soup.get_text()
docs.append(txt)

return docs
def clean_pmids(ids: list[PMID]) -> list[PMID]:
"""Remove prefixes from a list of PMIDs, returning the new list."""
clean_ids = [id.replace("PMID:", "", 1) for id in ids]
return clean_ids
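A quick illustration of the new helper (values made up; only a "PMID:" prefix is stripped):

    clean_pmids(["PMID:12345", "67890"])  # -> ["12345", "67890"]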


@dataclass
@@ -119,6 +79,8 @@ class PubmedClient:
This class is a wrapper around the Entrez API.
"""

# TODO: this doesn't need to be hardcoded
# and may vary based on the model in use
max_text_length = 3000

try:
@@ -210,13 +172,14 @@ def get_pmids(self, term: str) -> List[str]:
return pmids

def text(
self, ids: Union[list[PMID], PMID], raw=False, autoformat=True
self, ids: Union[list[PMID], PMID], raw=False, autoformat=True, pubmedcentral=False
) -> Union[list[str], str]:
"""Get the text of one or more papers from their PMIDs.
:param ids: List of PubMed IDs, or string with single PMID
:param raw: if True, do not parse the xml, just return the raw output with tags
:param autoformat: if True include title and abstract concatenated
:param pubmedcentral: if True, retrieve text from PubMed Central where possible
:return: the text of a single entry, or a list of strings for text of multiple entries
"""
batch_size = 200
@@ -228,7 +191,7 @@ def text(
singledoc = True
else:
singledoc = False
clean_ids = [id.replace("PMID:", "", 1) for id in ids]
clean_ids = clean_pmids(ids)
ids = clean_ids

# this will store the document data
@@ -369,7 +332,9 @@ def text(
# Parse that xml - this returns a list of strings
# if raw is True, the tags are kept, but we still get a list of docs
# and we don't truncate them
these_docs = parse_pmxml(xml=xml_data, raw=raw, autoformat=autoformat)
these_docs = self.parse_pmxml(
xml=xml_data, raw=raw, autoformat=autoformat, pubmedcentral=pubmedcentral
)

txt = []
for doc in these_docs:
@@ -381,12 +346,58 @@ def text(
txt.append(shortdoc)
else:
txt.append(doc)
if singledoc:
if singledoc and not pubmedcentral:
onetxt = txt[0]
txt = onetxt

return txt

def pmc_text(self, pmc_id: str) -> str:
"""Get the text of one PubMed Central entry.
Don't parse further here - just get the raw response.
:param pmc_id: a single PubMed Central ID, as a string
:return: the text of a single entry as XML
"""
xml_data = ""

fetch_url = EUTILS_URL + "efetch.fcgi"

if self.email and self.ncbi_key:
params = {
"db": "pmc",
"id": pmc_id,
"rettype": "xml",
"retmode": "xml",
"email": self.email,
"api_key": self.ncbi_key,
}
else:
params = {"db": "pmc", "id": pmc_id, "rettype": "xml", "retmode": "xml"}

trying = True
try_count = 0
while trying:
# Issue the request inside the loop so each retry actually re-fetches
response = requests.get(fetch_url, params=parse.urlencode(params, safe=","))
if response.status_code == 200:
xml_data = response.text
trying = False
else:
logging.error(
f"Encountered error in fetching from PubMed Central: {response.status_code}"
)
try_count = try_count + 1
if try_count < RETRY_MAX:
logging.info("Trying again...")
time.sleep(1)
else:
logging.info(f"Giving up - last status code {response.status_code}")
trying = False
logging.info(f"Retrieved PubMed Central document data for {pmc_id}.")

return xml_data
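A minimal usage sketch for the new method (the PMC ID is hypothetical):

    from ontogpt.clients.pubmed_client import PubmedClient

    client = PubmedClient()
    raw_xml = client.pmc_text("PMC9999999")  # raw efetch XML; parse_pmxml handles parsing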

def search(self, term: str, keywords: List[str] = None) -> List[PMID]:
"""Get the quality-scored text of PubMed papers relating to a search term and keywords.
@@ -416,3 +427,93 @@ def search(self, term: str, keywords: List[str] = None) -> List[PMID]:
score = id_and_score[1]
logging.debug(f"Yielding {pmid} with score {score} ")
yield f"{pmid}"

def parse_pmxml(self, xml: str, raw: bool, autoformat: bool, pubmedcentral: bool) -> List[str]:
"""Extract structured text from PubMed and PubMed Central XML.
:param xml: One or more xml entries, as string
:param raw: if True, do not parse the xml beyond separating documents
:param autoformat: if True include title and abstract concatenated
Otherwise the output will include ALL text contents besides XML tags
:param pubmedcentral: if True replace abstract with PubMed Central text
If there isn't a PMC ID, just use the abstract.
If there is a PMC ID, use the abstract AND chunk the body text.
This means the same ID may have multiple entries and may require multiple
queries to the LLM.
:return: a list of strings, one per entry
"""
docs = []

# Preprocess the string to ensure it's valid xml
if not raw:
logging.info("Preprocessing all xml entries...")
header = "\n".join(xml.split("\n", 3)[0:3])
pmas_opener = "<PubmedArticleSet>"
pmas_closer = "</PubmedArticleSet>"
for remove_string in [header, pmas_opener, pmas_closer]:
xml = xml.replace(remove_string, "\n")
xml = pmas_opener + xml + pmas_closer

soup = BeautifulSoup(xml, "xml")

logging.info("Parsing all xml entries...")
for pa in soup.find_all(["PubmedArticle", "PubmedBookArticle"]):
# First check the PMID, and if requested, any PMC ID
pmid = ""
if pa.find("PMID"): # If this is missing something has gone Wrong
pmid = pa.find("PMID").text
pmc_id = ""
has_pmc_id = False
if (
pa.find("PubmedData").find("ArticleIdList").find("ArticleId", {"IdType": "pmc"})
and pubmedcentral
):
pmc_id = (
pa.find("PubmedData")
.find("ArticleIdList")
.find("ArticleId", {"IdType": "pmc"})
.text
)
has_pmc_id = True
if autoformat and not raw and not has_pmc_id: # No PMC ID - just use title+abstract
ti = ""
if pa.find("ArticleTitle"):
ti = pa.find("ArticleTitle").text
ab = ""
if pa.find("Abstract"): # Document may not have abstract
ab = pa.find("Abstract").text
kw = ""
if pa.find("KeywordList"): # Document may not have MeSH terms or keywords
kw = [tag.text for tag in pa.find_all("Keyword")]
txt = f"Title: {ti}\nKeywords: {'; '.join(kw)}\nPMID: {pmid}\nAbstract: {ab}"
docs.append(txt)
elif autoformat and not raw and has_pmc_id: # PMC ID - get and use that text instead
fulltext = self.pmc_text(pmc_id)
fullsoup = BeautifulSoup(fulltext, "xml")
body = ""
if fullsoup.find("pmc-articleset").find("article").find("body"):
body = fullsoup.find("pmc-articleset").find("article").find("body").text
body = body.replace("\n", " ")
ti = ""
if pa.find("ArticleTitle"):
ti = pa.find("ArticleTitle").text
if pa.find("Abstract"): # Document may not have abstract
body = pa.find("Abstract").text + body
kw = ""
if pa.find("KeywordList"): # Document may not have MeSH terms or keywords
kw = [tag.text for tag in pa.find_all("Keyword")]

id_txt = f"Title: {ti}\nKeywords: {'; '.join(kw)}\nPMID: {pmid}\nPMCID: {pmc_id}\n"
full_max_len = self.max_text_length - len(id_txt)
chunktxt = [body[i : i + full_max_len] for i in range(0, len(body), full_max_len)]
for txt in chunktxt:
docs.append(id_txt + txt)
logging.warning(
f'Splitting entry containing "{txt[:50]}" into chunks of at most {self.max_text_length} chars'
)
elif raw:
docs.append(str(pa))
else:
docs.append(soup.get_text())

return docs
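To illustrate the chunking arithmetic in parse_pmxml (lengths assumed for the example):

    # With max_text_length = 3000 and a 100-char id_txt header,
    # full_max_len = 2900, so a 7000-char body yields three documents
    # of 2900, 2900, and 1200 chars, each prefixed with id_txt.
    body = "x" * 7000
    full_max_len = 3000 - 100
    chunks = [body[i : i + full_max_len] for i in range(0, len(body), full_max_len)]
    assert [len(c) for c in chunks] == [2900, 2900, 1200]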
8 changes: 8 additions & 0 deletions src/ontogpt/models.yaml
@@ -16,6 +16,14 @@ models:
creators:
- OpenAI

- name: MODEL_GPT_3_5_TURBO_16K
alternative_names:
- "gpt-3.5-turbo-16k"
- "openai-gpt-3.5-turbo-16k"
provider: OpenAI
creators:
- OpenAI

- name: MODEL_GPT_4
alternative_names:
- "gpt-4"
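With this entry registered, the 16k-context model can be selected from the CLI, as in the pubmed-annotate docstring example above:

    ontogpt pubmed-annotate -t phenotype "Takotsubo Cardiomyopathy: A Brief Review" --get-pmc --model gpt-3.5-turbo-16k --limit 3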
