Skip to content

Commit

Permalink
Merge branch 'master' (including tr13312 - JCI Insights) into tr13204
Browse files Browse the repository at this point in the history
Conflicts:
	PdfParser.py
  • Loading branch information
nidak21 committed Aug 14, 2020
2 parents 449cb0e + d0a5b6e commit db23865
Show file tree
Hide file tree
Showing 47 changed files with 1,367 additions and 230 deletions.
174 changes: 174 additions & 0 deletions ExtractedTextSet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@

"""
Name: ExtractedTextSet.py
Purpose:
This module provides utilities for recovering the extracted text for
references (bib_refs records) in the database.
Extracted text is stored in the bib_workflow_data table in the database,
but it is stored split into sections (body, references, supplemental, ...),
and it is not so easy to recover the full text concatenated back together.
The ExtractedTextSet class defined here does this for you.
Convenience functions for building an ExtractedTextSet for a set of
_refs_keys are also provided.
"""

def getExtractedTextSet(db,             # an initialized db module
                        refKeyList,     # list of _ref_keys
                        ):
    """
    Return an ExtractedTextSet for the references with the specified keys.
    Assumes refKeyList is small enough to format into a select statement.

    Every key must be convertible to int (ValueError/TypeError otherwise).
    If refKeyList is empty, no query is run and an empty ExtractedTextSet
    is returned.

    Example:
        import ExtractedTextSet
        import db
        db.set_sqlServer("bhmgidevdb01")
        db.set_sqlDatabase("prod")
        db.set_sqlUser("mgd_public")
        db.set_sqlPassword("mgdpub")
        refKeys = [390554, 390545]
        ets = ExtractedTextSet.getExtractedTextSet(db, refKeys)
        for r in refKeys:
            text = ets.getExtText(r)
            ...
    """
    # The keys are pasted directly into the SQL text, so force each one
    # through int() first: anything that is not an integer key fails here
    # instead of becoming part of the statement.
    keyStrs = [ str(int(r)) for r in refKeyList ]

    # "in ( )" with nothing inside is a SQL syntax error, and the answer for
    # "no references" is known without asking the database.
    if not keyStrs:
        return ExtractedTextSet([])

    query = '''
select bd._refs_key, t.term "text_type", bd.extractedtext "text_part"
from bib_workflow_data bd join voc_term t on
(bd._extractedtext_key = t._term_key)
where bd._refs_key in ( %s )
''' % ','.join(keyStrs)
    results = db.sql([query], 'auto')

    # db.sql() is given a list of statements; the last result set is the
    # one for our (only) select.
    ets = ExtractedTextSet(results[-1])
    return ets
#-----------------------------------

def getExtractedTextSetForTable(db,             # an initialized db module
                                tmpTableName,   # (string) name of tmp table
                                ):
    """
    Build an ExtractedTextSet for every reference listed in a tmp table
    in the database.
    The tmp table only needs a _refs_key field (an index on that field is
    advisable for efficiency).
    The table name is formatted directly into the select statement.
    """
    selectTemplate = '''
select r._refs_key, t.term "text_type", bd.extractedtext "text_part"
from %s r join bib_workflow_data bd on (r._refs_key = bd._refs_key)
join voc_term t on (bd._extractedtext_key = t._term_key)
'''
    selectStmt = selectTemplate % tmpTableName

    # one statement submitted, so the last result set holds its rows
    resultSets = db.sql([selectStmt], 'auto')
    return ExtractedTextSet(resultSets[-1])
#-----------------------------------

class ExtractedTextSet (object):
    """
    IS      a collection of extracted text records (from multiple references)
    HAS     each extracted text record is a dict with fields
                {'_refs_key' : int,
                 'text_type': (e.g., 'body', 'reference'),
                 'text_part': text}
            The records may have other fields too that are not used here.
            The field names '_refs_key', 'text_type', 'text_part' are
            specifiable.
    DOES    (1) collects and concatenates all the text sections for a given
                _refs_key into a single text field in the correct order -
                thus recapitulating the full extracted text.
            (2) getExtText(refKey) - get the extracted text for a given
                _refs_key
            (3) join a set of basic reference records to their extracted text
    """
    # from Vocab_key = 142 (Lit Triage Extracted Text Section vocab)
    # These are the expected values for the 'text_type' field.
    # The order of this list is also the order in which getExtText()
    # concatenates the sections.
    validTextTypes = [ 'body', 'reference',
                       'author manuscript fig legends',
                       'star methods',
                       'supplemental', ]
    #-----------------------------------

    def __init__(self,
                 extTextRcds,           # list of rcds as above
                 keyLabel='_refs_key',  # name of the reference key field
                 typeLabel='text_type', # name of the text type field
                 textLabel='text_part', # name of the text field
                 ):
        """
        Store the records and field names, then index the records by
        reference key.
        Raises ValueError if a record has a text type that is not in
        validTextTypes.
        """
        self.keyLabel = keyLabel
        self.typeLabel = typeLabel
        self.textLabel = textLabel
        self.extTextRcds = extTextRcds
        self._gatherExtText()
    #-----------------------------------

    def hasExtText(self, refKey ):
        """ Return True if this ExtractedTextSet has text for refKey
            (i.e., at least one record was supplied for refKey).
            refKey may be an int or a str.
        """
        return str(refKey) in self.key2TextParts
    #-----------------------------------

    def getExtText(self, refKey ):
        """ Return the text for refKey (or '' if there is no text).
            The sections are concatenated in validTextTypes order.
            refKey may be an int or a str.
        """
        extTextDict = self.key2TextParts.get(str(refKey), {})

        # "or ''" covers both a missing section and a section whose stored
        # text is None; concatenating None would raise a TypeError.
        return ''.join([ extTextDict.get(textType) or ''
                         for textType in self.validTextTypes ])
    #-----------------------------------

    def joinRefs2ExtText(self,
                         refRcds,
                         refKeyLabel='_refs_key',
                         extTextLabel='ext_text',
                         allowNoText=True,
                         ):
        """
        Assume refRcds is a list of records { refKeyLabel : xxx, ...}
        For each record in the list, add a field: extTextLabel: text
        so that the extracted text becomes part of the record.
        If allowNoText is False, then a ValueError is raised if a refRcd is
        found with no extracted text.
        The records are modified in place; the same list is returned.
        """
        for r in refRcds:
            refKey = str(r[refKeyLabel])

            if not allowNoText and refKey not in self.key2TextParts:
                raise ValueError("No extracted text found for '%s'\n" % \
                                                                    refKey)
            r[extTextLabel] = self.getExtText(refKey)

        return refRcds
    #-----------------------------------

    def _gatherExtText(self, ):
        """
        Gather the extracted text sections for each _refs_key
        Set self.key2TextParts to, and return, the dict
            { _refs_key: { extractedTextType : text } }
        E.g., { '12345' : { 'body'        : 'body section text',
                            'reference'   : 'ref section text',
                            'star methods': '...text...',
                          } }
        (we force all _refs_keys to strings so user can use either int or str)
        If a reference has more than one record of the same text type, the
        last one seen is kept.
        Raises ValueError for a text type that is not in validTextTypes.
        """
        resultDict = {}
        for r in self.extTextRcds:
            refKey   = str(r[self.keyLabel])
            textType = r[self.typeLabel]
            textPart = r[self.textLabel]

            if textType not in self.validTextTypes:
                raise ValueError("Invalid extracted text type: '%s'\n" % \
                                                                    textType)

            resultDict.setdefault(refKey, {})[textType] = textPart

        self.key2TextParts = resultDict
        return self.key2TextParts
    #-----------------------------------
# end class ExtractedTextSet -----------------------------------

5 changes: 5 additions & 0 deletions HISTORY
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
TAG:
DATE: 08/14/2020
STAFF: jak
Merge master (including tr13312) into tr13204 - python 3

TAG: lib_py_littriage-6-0-15-3
DATE: 03/10/2020
STAFF: lec
Expand Down
Loading

0 comments on commit db23865

Please sign in to comment.