From 843dacf79f2e681f5f98e12c6589917ea47cc3f2 Mon Sep 17 00:00:00 2001 From: Lori Corbani Date: Thu, 2 Apr 2020 07:29:03 -0400 Subject: [PATCH] TR13204/Infrastructure/python 3 --- HttpRequestGovernor.py | 9 +- Install | 5 +- PdfParser.py | 549 +++++++++++++------------ Pdfpath.py | 45 +- PubMedAgent.py | 545 +++++++++++++------------ PubMedCentralAgent.py | 15 +- extractedTextSplitter.py | 857 +++++++++++++++++++-------------------- 7 files changed, 1008 insertions(+), 1017 deletions(-) diff --git a/HttpRequestGovernor.py b/HttpRequestGovernor.py index 29edeb4..40b2714 100755 --- a/HttpRequestGovernor.py +++ b/HttpRequestGovernor.py @@ -1,4 +1,3 @@ -# Name: HttpRequestGovernor.py # Purpose: provides a class for managing the frequency with which we can make HTTP requests, # to ensure a configurable amount of "niceness" when reading from other sites # Notes: @@ -14,7 +13,7 @@ # You can also ask the governor to report on its statistics so far. import time -import urllib2 +import urllib.request, urllib.error, urllib.parse import runCommand # constants for convenience @@ -31,7 +30,7 @@ def readURL (url): # Purpose: given constraints on reading from https connections in python 2.7, we're just going # to shell out and use curl for this - # Returns: string returned + # Returns: str.returned # Throws: Exception if we have problems reading from 'url' stdout, stderr, statusCode = runCommand.runCommand("curl '%s'" % url) @@ -135,7 +134,7 @@ def get (self, url): try: response = readURL(url) - except Exception, e: + except Exception as e: raise Exception('The server could not fulfill the request: %s' % str(e)) return response @@ -150,4 +149,4 @@ def getStatistics (self): 'Average wait time: %6.3f sec' % (sum(self.timesWaited) / self.requestCount), 'Maximum wait time: %6.3f sec' % max(self.timesWaited), ] - return stats \ No newline at end of file + return stats diff --git a/Install b/Install index 282f04a..8d07993 100755 --- a/Install +++ b/Install @@ -48,7 +48,7 @@ fi # # 
Compile all Python scripts. # -python -c 'import compileall; compileall.compile_dir(".")' +${PYTHON} -m compileall -l -f . if [ $? -ne 0 ] then echo "Error compiling Python source" @@ -59,12 +59,11 @@ fi # Set the proper permissions on the Python files. # chmod 775 *.py -chmod 664 *.pyc # # Copy the Python files to the given library directory. # -for FILE in `ls *.py *.pyc` +for FILE in `ls *.py` do rm -f ${LIBRARY_DIRECTORY}/${FILE} cp -p ${FILE} ${LIBRARY_DIRECTORY} diff --git a/PdfParser.py b/PdfParser.py index 4a05939..94caa1d 100755 --- a/PdfParser.py +++ b/PdfParser.py @@ -1,4 +1,3 @@ -# Name: pdfParser.py # Purpose: provides functions for extracting text from PDF files # Notes: # 1. relies on MGI's litparser product to do the actual processing @@ -51,282 +50,282 @@ ###--- Functions ---### def setLitParserDir ( - directory # string; path to the litparser product - ): - # Purpose: initialize this module by identifying where to find the - # litparser product. - # Throws: Exception if 'directory' does not exist or if it does not - # contain the expected pdfGetFullText.sh script. - - global LITPARSER - - if not os.path.isdir(directory): - raise Exception('%s is not a directory' % directory) - - LITPARSER = os.path.join(directory, 'pdfGetFullText.sh') - if not os.path.exists(LITPARSER): - raise Exception('%s does not exist' % LITPARSER) - return - + directory # str. path to the litparser product + ): + # Purpose: initialize this module by identifying where to find the + # litparser product. + # Throws: Exception if 'directory' does not exist or if it does not + # contain the expected pdfGetFullText.sh script. 
+ + global LITPARSER + + if not os.path.isdir(directory): + raise Exception('%s is not a directory' % directory) + + LITPARSER = os.path.join(directory, 'pdfGetFullText.sh') + if not os.path.exists(LITPARSER): + raise Exception('%s does not exist' % LITPARSER) + return + def hyphenate (s): - # Purpose: fix the hyphenation in Blood DOI IDs, which should be - # of the format "-yyyy-mm-others" where the first six digits - # are the year, the next two are the month, and then all the - # others come at the end - # Returns: string updated according to 'Purpose', or the input string - # if there are not enough digits - - digits = s.replace('-', '').replace('.', '').replace(' ', '') - if len(digits) < 7: - return s - if s.find('.') >= 0: - return '.%s%s%s' % (digits[:4], digits[4:6], digits[6:]) - else: - return '-%s-%s-%s' % (digits[:4], digits[4:6], digits[6:]) + # Purpose: fix the hyphenation in Blood DOI IDs, which should be + # of the format "-yyyy-mm-others" where the first six digits + # are the year, the next two are the month, and then all the + # others come at the end + # Returns: str.updated according to 'Purpose', or the input string + # if there are not enough digits + + digits = s.replace('-', '').replace('.', '').replace(' ', '') + if len(digits) < 7: + return s + if s.find('.') >= 0: + return '.%s%s%s' % (digits[:4], digits[4:6], digits[6:]) + else: + return '-%s-%s-%s' % (digits[:4], digits[4:6], digits[6:]) ###--- Classes ---### class PdfParser: - # Is: a parser that knows how to extract text from a PDF file - # Has: path to a PDF file, text from a PDF file - # Does: reads a PDF file from the file system, parses it, provides - # access to full text and various bits of information - - def __init__ (self, - pdfPath # string; path to PDF file to parse - ): - # Purpose: constructor - # Throws: Exception if the file specified in 'pdfPath' does - # not exist - - if not os.path.exists(pdfPath): - raise Exception('PDF file does not exist: %s' % pdfPath) - - 
self.pdfPath = pdfPath # string; path to the PDF file - self.fullText = None # string; text from the PDF file - self.loaded = False # boolean; did we read the file yet? - return - - def _loadFullText (self): - # Purpose: (private) get the text from the PDF file - # Throws: Exception if this library has not been properly - # initialized or if there are errors in parsing the file - # Notes: only loads the file once; if we already ready it, - # calling this function is a no-op. - - if self.loaded: - return - - if not LITPARSER: - raise Exception('Must initialize pdfParser library using setLitParserDir()') - - cmd = '%s %s' % (LITPARSER, self.pdfPath) - try: - (stdout, stderr, exitCode) = runCommand.runCommand(cmd) - except: - # error in attempting to execute parsing script - raise Exception('Failed to execute: %s' % cmd) - - # parsing script finished with an error code? - if (exitCode != 0): - raise Exception('Failed to parse %s' % self.pdfPath) - - # parsing was successful, so grab the text and note that we - # loaded the file - - self.fullText = stdout - self.loaded = True - return - - def getFirstDoiID (self): - # Purpose: return the first DOI ID from the PDF file - # Returns: string DOI ID or None (if no ID can be found) - # Throws: Exception if this library has not been properly - # initialized or if there are errors in parsing the file - # Note: this would be more aptly named getDoiID() - - self._loadFullText() - - if self.fullText: - - # PNAS only - if self.fullText.find('www.pnas.org') >= 0: - - match = PNAS_DOI_RE.search(self.fullText) - doiID = match.group(1) - - # may have DCSuppoemental - try: - if self.fullText.find('DCSupplemental') >= 0: - doiID = match.group(2) - except: - pass - - # PNAS DOI sometimes have missing '/' so can't be found using DOI_RE - # determine if missing '/' OR intervening SINGLE non-alphnumeric char - # if no '/' - if doiID.find('/') == -1: - if doiID.find('pnas') == 7: # there is no '/', add one - doiID = doiID.replace('10.1073', 
'10.1073/') - elif doiID.find('pnas') == 8: # there is a single intervening char - charToReplace = doiID[7] - doiID = doiID.replace(charToReplace, '/') - return doiID - - # all else - else: - self.fullText = self.fullText.replace(' journal.pone', 'journal.pone') - match = DOI_RE.search(self.fullText) - - if match: - doiID = match.group(1) - slash = doiID.find('/') - nl = doiID.find('\n') - - # special case for PLoS journals, which often have a line break in the ID. - # PLOS journals have 28-character DOI IDs 99.98% of the time. Out of 10,000+ - # PLOS DOI IDs in MGI so far, the only others are single IDs with 21 and 24 - # characters. So if we encounter a newline within the first 21 characters, - # we can just remove it. - # Also as of new pdftotext util we started using in Oct 2019, the 1st or 2nd - # ID occurrance in the paper may be truncated when a space is inserted - # instead of a line break. So try looking for a couple ID instances. - - if doiID.startswith('10.1371/'): - if (0 <= nl < 21): # remove potential nl - doiID = doiID.replace('\n', '', 1) - slash = doiID.find('/') - nl = doiID.find('\n') - i = 0 - while len(doiID) < 28: # try another occurrance - if i == 3: break # quit after 3 tries - i += 1 - - match = DOI_RE.search(self.fullText, match.end()) - if not match: break # odd, this shouldn't happen, bail - doiID = match.group(1) - slash = doiID.find('/') - nl = doiID.find('\n') - - if (0 <= nl < 21): # remove potential nl - doiID = doiID.replace('\n', '', 1) - slash = doiID.find('/') - nl = doiID.find('\n') - - # special case for Molecular and Cellular Biology journal, which has DOI IDs - # from 20 to 32 characters -- but which are often interrupted by line breaks - # in their new (circa late-2016) PDF format. As a workaround for the most - # common case, remove any newlines within the first 20 characters of the ID. 
- - if doiID.startswith('10.1128/'): - while 0 <= nl < 20: - doiID = doiID.replace('\n', '', 1) - nl = doiID.find('\n') - - # if there is a newline right after the slash, - # just remove it - - if (nl >= 0) and (nl == (slash+1)): - doiID = doiID.replace('\n', '', 1) - nl = doiID.find('\n') - - # if there is a newline later in the string, - # trim the ID at that point - - if (nl >= 0) and (nl > slash): - doiID = doiID[:nl] - - # strip off trailing parentheses, periods, - # brackets, and whitespace - doiID = re.sub('[\)\.\]\s]+$', '', doiID) - - # eLife IDs often errantly end with .001 - if (doiID.find('/eLife') > 0) and (doiID.endswith('.001')): - doiID = doiID[:-4] - - # if this is a Blood DOI ID, - # the hypenation sometimes needs tweaking - # may contain a '.' or a ' ' - if doiID.startswith('10.1182/blood'): - match = BLOOD_DOI_RE.search(self.fullText) - doiID = match.group(0) - numbers = match.group(1) - revised = hyphenate(numbers) - doiID = doiID.replace(numbers, revised) - doiID = doiID.replace(' ', '') - doiID = doiID.replace('\n', '') - - if doiID.startswith('10.1172/jci'): - match = JCI_DOI_RE.search(self.fullText) - doiID = match.group(0) - doiID = doiID.replace(' ', '') - - if doiID.startswith('10.1530/REP'): - match = REP_DOI_RE.search(self.fullText) - doiID = match.group(0) - doiID = doiID.replace('doi.org/', '') - doiID = doiID.replace(' ', '') - - # if this is a 10.1177/...Journal DOI ID, - # then remove the trailing 'Journal' text - match = JOURNAL_DOI_RE.match(doiID) - if match: - doiID = doiID.replace('Journal', '') - - # if this is a Science DOI ID, we instead need - # to find and return the last DOI ID for the - # PDF file. - if doiID.startswith('10.1126/science') or \ - doiID.startswith('10.1126/scisignal'): - doiID = self._getScienceID() - - return doiID - return None - - def _getScienceID (self): - # Science journals include the end of the prior article at the - # start of the PDF file. 
This means that we will usually - # return an inaccurate DOI ID for PDFs from Science journals. - # Instead, the desired ID occurs at the end of the article, - # shortly after the word "accepted". Use these criteria to - # get the desired ID and return it. - - # To get to this method, we must have already loaded the - # full text, and it must have been non-null. - - # Find all occurrences of the word 'accepted' and note the - # position of each. (It is possible that 'accepted' would - # occur in the start of the next article, so we can't just - # blindly take the last one.) - - acceptedPositions = [] - match = ACCEPTED_RE.search(self.fullText) - while match: - pos = match.regs[0][0] - acceptedPositions.append(pos) - match = ACCEPTED_RE.search(self.fullText, pos + 1) - - # Now start at the last occurrence of "accepted" and see if - # we can find a Science DOI ID reasonably soon after it. If - # so, that's our desired ID to return. If not, work back - # through the other instances of "accepted". - - # how close is close enough? (number of characters) - threshold = 80 - acceptedPositions.reverse() - - for accPos in acceptedPositions: - match = SCIENCE_DOI_RE.search(self.fullText, accPos) - if match: - if (match.regs[0][0] <= (accPos + threshold)): - return match.group(1) - return None - - def getText (self): - # Purpose: return the full text extracted from the PDF file - # Returns: string (full text) - - self._loadFullText() - if self.fullText: - return self.fullText - return None + # Is: a parser that knows how to extract text from a PDF file + # Has: path to a PDF file, text from a PDF file + # Does: reads a PDF file from the file system, parses it, provides + # access to full text and various bits of information + + def __init__ (self, + pdfPath # str. 
path to PDF file to parse + ): + # Purpose: constructor + # Throws: Exception if the file specified in 'pdfPath' does + # not exist + + if not os.path.exists(pdfPath): + raise Exception('PDF file does not exist: %s' % pdfPath) + + self.pdfPath = pdfPath # str. path to the PDF file + self.fullText = None # str. text from the PDF file + self.loaded = False # boolean; did we read the file yet? + return + + def _loadFullText (self): + # Purpose: (private) get the text from the PDF file + # Throws: Exception if this library has not been properly + # initialized or if there are errors in parsing the file + # Notes: only loads the file once; if we already ready it, + # calling this function is a no-op. + + if self.loaded: + return + + if not LITPARSER: + raise Exception('Must initialize pdfParser library using setLitParserDir()') + + cmd = '%s %s' % (LITPARSER, self.pdfPath) + try: + (stdout, stderr, exitCode) = runCommand.runCommand(cmd) + except: + # error in attempting to execute parsing script + raise Exception('Failed to execute: %s' % cmd) + + # parsing script finished with an error code? 
+ if (exitCode != 0): + raise Exception('Failed to parse %s' % self.pdfPath) + + # parsing was successful, so grab the text and note that we + # loaded the file + + self.fullText = stdout + self.loaded = True + return + + def getFirstDoiID (self): + # Purpose: return the first DOI ID from the PDF file + # Returns: str.DOI ID or None (if no ID can be found) + # Throws: Exception if this library has not been properly + # initialized or if there are errors in parsing the file + # Note: this would be more aptly named getDoiID() + + self._loadFullText() + + if self.fullText: + + # PNAS only + if self.fullText.find('www.pnas.org') >= 0: + + match = PNAS_DOI_RE.search(self.fullText) + doiID = match.group(1) + + # may have DCSuppoemental + try: + if self.fullText.find('DCSupplemental') >= 0: + doiID = match.group(2) + except: + pass + + # PNAS DOI sometimes have missing '/' so can't be found using DOI_RE + # determine if missing '/' OR intervening SINGLE non-alphnumeric char + # if no '/' + if doiID.find('/') == -1: + if doiID.find('pnas') == 7: # there is no '/', add one + doiID = doiID.replace('10.1073', '10.1073/') + elif doiID.find('pnas') == 8: # there is a single intervening char + charToReplace = doiID[7] + doiID = doiID.replace(charToReplace, '/') + return doiID + + # all else + else: + self.fullText = self.fullText.replace(' journal.pone', 'journal.pone') + match = DOI_RE.search(self.fullText) + + if match: + doiID = match.group(1) + slash = doiID.find('/') + nl = doiID.find('\n') + + # special case for PLoS journals, which often have a line break in the ID. + # PLOS journals have 28-character DOI IDs 99.98% of the time. Out of 10,000+ + # PLOS DOI IDs in MGI so far, the only others are single IDs with 21 and 24 + # characters. So if we encounter a newline within the first 21 characters, + # we can just remove it. 
+ # Also as of new pdftotext util we started using in Oct 2019, the 1st or 2nd + # ID occurrance in the paper may be truncated when a space is inserted + # instead of a line break. So try looking for a couple ID instances. + + if doiID.startswith('10.1371/'): + if (0 <= nl < 21): # remove potential nl + doiID = doiID.replace('\n', '', 1) + slash = doiID.find('/') + nl = doiID.find('\n') + i = 0 + while len(doiID) < 28: # try another occurrance + if i == 3: break # quit after 3 tries + i += 1 + + match = DOI_RE.search(self.fullText, match.end()) + if not match: break # odd, this shouldn't happen, bail + doiID = match.group(1) + slash = doiID.find('/') + nl = doiID.find('\n') + + if (0 <= nl < 21): # remove potential nl + doiID = doiID.replace('\n', '', 1) + slash = doiID.find('/') + nl = doiID.find('\n') + + # special case for Molecular and Cellular Biology journal, which has DOI IDs + # from 20 to 32 characters -- but which are often interrupted by line breaks + # in their new (circa late-2016) PDF format. As a workaround for the most + # common case, remove any newlines within the first 20 characters of the ID. + + if doiID.startswith('10.1128/'): + while 0 <= nl < 20: + doiID = doiID.replace('\n', '', 1) + nl = doiID.find('\n') + + # if there is a newline right after the slash, + # just remove it + + if (nl >= 0) and (nl == (slash+1)): + doiID = doiID.replace('\n', '', 1) + nl = doiID.find('\n') + + # if there is a newline later in the str. + # trim the ID at that point + + if (nl >= 0) and (nl > slash): + doiID = doiID[:nl] + + # strip off trailing parentheses, periods, + # brackets, and whitespace + doiID = re.sub('[\)\.\]\s]+$', '', doiID) + + # eLife IDs often errantly end with .001 + if (doiID.find('/eLife') > 0) and (doiID.endswith('.001')): + doiID = doiID[:-4] + + # if this is a Blood DOI ID, + # the hypenation sometimes needs tweaking + # may contain a '.' 
or a ' ' + if doiID.startswith('10.1182/blood'): + match = BLOOD_DOI_RE.search(self.fullText) + doiID = match.group(0) + numbers = match.group(1) + revised = hyphenate(numbers) + doiID = doiID.replace(numbers, revised) + doiID = doiID.replace(' ', '') + doiID = doiID.replace('\n', '') + + if doiID.startswith('10.1172/jci'): + match = JCI_DOI_RE.search(self.fullText) + doiID = match.group(0) + doiID = doiID.replace(' ', '') + + if doiID.startswith('10.1530/REP'): + match = REP_DOI_RE.search(self.fullText) + doiID = match.group(0) + doiID = doiID.replace('doi.org/', '') + doiID = doiID.replace(' ', '') + + # if this is a 10.1177/...Journal DOI ID, + # then remove the trailing 'Journal' text + match = JOURNAL_DOI_RE.match(doiID) + if match: + doiID = doiID.replace('Journal', '') + + # if this is a Science DOI ID, we instead need + # to find and return the last DOI ID for the + # PDF file. + if doiID.startswith('10.1126/science') or \ + doiID.startswith('10.1126/scisignal'): + doiID = self._getScienceID() + + return doiID + return None + + def _getScienceID (self): + # Science journals include the end of the prior article at the + # start of the PDF file. This means that we will usually + # return an inaccurate DOI ID for PDFs from Science journals. + # Instead, the desired ID occurs at the end of the article, + # shortly after the word "accepted". Use these criteria to + # get the desired ID and return it. + + # To get to this method, we must have already loaded the + # full text, and it must have been non-null. + + # Find all occurrences of the word 'accepted' and note the + # position of each. (It is possible that 'accepted' would + # occur in the start of the next article, so we can't just + # blindly take the last one.) 
+ + acceptedPositions = [] + match = ACCEPTED_RE.search(self.fullText) + while match: + pos = match.regs[0][0] + acceptedPositions.append(pos) + match = ACCEPTED_RE.search(self.fullText, pos + 1) + + # Now start at the last occurrence of "accepted" and see if + # we can find a Science DOI ID reasonably soon after it. If + # so, that's our desired ID to return. If not, work back + # through the other instances of "accepted". + + # how close is close enough? (number of characters) + threshold = 80 + acceptedPositions.reverse() + + for accPos in acceptedPositions: + match = SCIENCE_DOI_RE.search(self.fullText, accPos) + if match: + if (match.regs[0][0] <= (accPos + threshold)): + return match.group(1) + return None + + def getText (self): + # Purpose: return the full text extracted from the PDF file + # Returns: str.(full text) + + self._loadFullText() + if self.fullText: + return self.fullText + return None diff --git a/Pdfpath.py b/Pdfpath.py index 2eadd12..2c1bc6e 100755 --- a/Pdfpath.py +++ b/Pdfpath.py @@ -1,4 +1,3 @@ -''' # # Pdfpath.py # @@ -17,7 +16,6 @@ # - TR12250/Lit Triage # 04/04/2019 jak # - TR12763 -''' import sys import os @@ -43,31 +41,30 @@ def getPdfpath(parentpath, mgiID): #print getPdfpath('/data/littriage', 'MGI:') #print '' - print 'MGI:1' - print getPdfpath('/data/littriage', 'MGI:1') - print '' + print ('MGI:1') + print (getPdfpath('/data/littriage', 'MGI:1')) + print ('') - print 'MGI:11' - print getPdfpath('/data/littriage', 'MGI:11') - print '' + print ('MGI:11') + print (getPdfpath('/data/littriage', 'MGI:11')) + print ('') - print 'MGI:111' - print getPdfpath('/data/littriage', 'MGI:111') - print '' + print ('MGI:111') + print (getPdfpath('/data/littriage', 'MGI:111')) + print ('') - print 'MGI:1111' - print getPdfpath('/data/littriage/', 'MGI:1111') - print '' + print ('MGI:1111') + print (getPdfpath('/data/littriage/', 'MGI:1111')) + print ('') - print 'MGI:11111' - print getPdfpath('/data/littriage/', 'MGI:11111') - print '' + print 
('MGI:11111') + print (getPdfpath('/data/littriage/', 'MGI:11111')) + print ('') - print 'MGI:111111' - print getPdfpath('/data/littriage/', 'MGI:111111') - print '' - - print 'MGI:1111111' - print getPdfpath('/data/littriage/', 'MGI:1111111') - print '' + print ('MGI:111111') + print (getPdfpath('/data/littriage/', 'MGI:111111')) + print ('') + print ('MGI:1111111') + print (getPdfpath('/data/littriage/', 'MGI:1111111')) + print ('') diff --git a/PubMedAgent.py b/PubMedAgent.py index f5e99a0..f7a7568 100755 --- a/PubMedAgent.py +++ b/PubMedAgent.py @@ -1,4 +1,3 @@ -# Name: PubMedAgent.py # Purpose: to provide an easy means to fetch reference data from PubMed in # a variety of formats # Usage: @@ -10,7 +9,7 @@ # back data in your desired format using getReference(doiID) or getReferences(doiList) import string -import urllib +import urllib.request, urllib.parse, urllib.error import csv import xml.dom.minidom import os @@ -74,188 +73,188 @@ class PubMedReference: # error message is then accessible from getErrorMessage(). 
def __init__ (self, errorMessage = None): - self.pubMedID = None - self.doiID = None - self.title = None - self.authors = None - self.journal = None - self.date = None - self.year = None - self.issue = None - self.pages = None - self.abstract = None - self.volume = None - self.primaryAuthor = None - self.publicationType = None - # add other fields as needed - - self.errorMessage = errorMessage - - return + self.pubMedID = None + self.doiID = None + self.title = None + self.authors = None + self.journal = None + self.date = None + self.year = None + self.issue = None + self.pages = None + self.abstract = None + self.volume = None + self.primaryAuthor = None + self.publicationType = None + # add other fields as needed + + self.errorMessage = errorMessage + + return ###--- setter/getter methods ---### def isValid(self): - return self.errorMessage == None + return self.errorMessage == None def getErrorMessage(self): - return self.errorMessage + return self.errorMessage def setPubMedID(self, pmID): - self.pubMedID = pmID + self.pubMedID = pmID def getPubMedID(self): - return self.pubMedID + return self.pubMedID def setDoiID(self, doiID): - self.doiID = doiID + self.doiID = doiID def getDoiID(self): - return self.doiID + return self.doiID def setTitle(self, title): - self.title = title + self.title = title def getTitle(self): - return self.title + return self.title def setAuthors(self, authors): - self.authors = authors + self.authors = authors def getAuthors(self): - return self.authors + return self.authors def setJournal(self, journal): - self.journal = journal + self.journal = journal def getJournal(self): - return self.journal + return self.journal def setDate(self, date): - self.date = date + self.date = date def getDate(self): - return self.date + return self.date def setYear(self, year): - self.year = year + self.year = year def getYear(self): - return self.year + return self.year def setIssue(self, issue): - self.issue = issue + self.issue = issue def 
getIssue(self): - return self.issue + return self.issue def setPages(self, pages): - self.pages = pages + self.pages = pages def getPages(self): - return self.pages + return self.pages def setAbstract(self, abstract): - self.abstract = abstract + self.abstract = abstract def getAbstract(self): - return self.abstract + return self.abstract def setVolume(self, volume): - self.volume = volume + self.volume = volume def getVolume(self): - return self.volume + return self.volume def setPrimaryAuthor(self, pAuthor): - self.primaryAuthor = pAuthor + self.primaryAuthor = pAuthor def getPrimaryAuthor(self): - return self.primaryAuthor + return self.primaryAuthor def setPublicationType(self, publicationType): - self.publicationType = publicationType + self.publicationType = publicationType def getPublicationType(self): - return self.publicationType + return self.publicationType # add other accessors as needed class PubMedAgent: - # Is: an agent that interacts with PubMed to get reference data - # for DOI IDs - # Does: takes DOI IDs, queries PubMed, and returns PubMedReference - # objects for them - - def __init__ (self): - # Purpose: constructor - return - - def getPubMedID (self, doiID): - # Purpose: return the PubMed ID corresponding to this doiID, or None - # if there is no corresponding PubMed ID - # Throws: Exception if the URL returns an error - # Notes: 6/30 - not tested - - return self.getPubMedIDs([doiID])[doiID] - - def getPubMedIDs (self, doiList): - # Purpose: return a dictionary mapping from each DOI ID to its - # corresponding PubMed ID. If no PubMed ID for a given DOI ID, - # then that one maps to None. 
- # Throws: Exception if the URL returns an error - mapping = {} # {doiid: [pubMedId(s)], ...} - try: - #print '### Getting PubMed IDs ###\n' - for doiID in doiList: - forUrl = doiID - forUrl = doiID.replace('(', '*') - forUrl = doiID.replace(')', '*') - forUrl = doiID.replace(';', '*') - forUrl = doiID.replace(':', '*') - response = urllib.urlopen(ID_CONVERTER_URL % (XML, forUrl)) - record = string.strip(response.read()) - xmldoc = xml.dom.minidom.parseString(record) - pubmedIDs = xmldoc.getElementsByTagName("Id") - #print '*****\n\n' - #print ID_CONVERTER_URL % (XML, doiID) - #print record - #print 'pubmedIDs : ', str(pubmedIDs) - #print 'doiID : ', doiID - if doiID not in mapping: - mapping[doiID] = [] - if pubmedIDs == []: - mapping[doiID].append(None) - else: - for pmID in pubmedIDs: - #print 'pm: %s' % pmID.firstChild.data - mapping[doiID].append(pmID.firstChild.data) - except IOError, e: - if hasattr(e, 'code'): # HTTPError - print 'HTTP error code: ', e.code - raise Exception('HTTP error code: %s' % e.code) - elif hasattr(e, 'reason'): # URLError - print "Can't connect, reason: ", e.reason - raise Exception("Can't connect, reason: %s" % e.reason) - else: - raise Exception('Unknown exception: %s' % e) - - return mapping - - def getReferenceInfo(self, doiList): - # Purpose: stub to be implemented by child - return - - def getReference (self, doiID): - # Purpose: returns a dictionary that maps each DOI ID to its + # Is: an agent that interacts with PubMed to get reference data + # for DOI IDs + # Does: takes DOI IDs, queries PubMed, and returns PubMedReference + # objects for them + + def __init__ (self): + # Purpose: constructor + return + + def getPubMedID (self, doiID): + # Purpose: return the PubMed ID corresponding to this doiID, or None + # if there is no corresponding PubMed ID + # Throws: Exception if the URL returns an error + # Notes: 6/30 - not tested + + return self.getPubMedIDs([doiID])[doiID] + + def getPubMedIDs (self, doiList): + # Purpose: 
return a dictionary mapping from each DOI ID to its + # corresponding PubMed ID. If no PubMed ID for a given DOI ID, + # then that one maps to None. + # Throws: Exception if the URL returns an error + mapping = {} # {doiid: [pubMedId(s)], ...} + try: + #print '### Getting PubMed IDs ###\n' + for doiID in doiList: + forUrl = doiID + forUrl = doiID.replace('(', '*') + forUrl = doiID.replace(')', '*') + forUrl = doiID.replace(';', '*') + forUrl = doiID.replace(':', '*') + response = urllib.request.urlopen(ID_CONVERTER_URL % (XML, forUrl)) + record = str.strip(response.read()) + xmldoc = xml.dom.minidom.parseString(record) + pubmedIDs = xmldoc.getElementsByTagName("Id") + #print '*****\n\n' + #print ID_CONVERTER_URL % (XML, doiID) + #print record + #print 'pubmedIDs : ', str(pubmedIDs) + #print 'doiID : ', doiID + if doiID not in mapping: + mapping[doiID] = [] + if pubmedIDs == []: + mapping[doiID].append(None) + else: + for pmID in pubmedIDs: + #print 'pm: %s' % pmID.firstChild.data + mapping[doiID].append(pmID.firstChild.data) + except IOError as e: + if hasattr(e, 'code'): # HTTPError + print('HTTP error code: ', e.code) + raise Exception('HTTP error code: %s' % e.code) + elif hasattr(e, 'reason'): # URLError + print("Can't connect, reason: ", e.reason) + raise Exception("Can't connect, reason: %s" % e.reason) + else: + raise Exception('Unknown exception: %s' % e) + + return mapping + + def getReferenceInfo(self, doiList): + # Purpose: stub to be implemented by child + return + + def getReference (self, doiID): + # Purpose: returns a dictionary that maps each DOI ID to its # corresponding PubMedReference object(s) (or None, if there # is no reference data in PubMed for that DOI ID) - # DOI ID can map to multiple PubMed - # sc - this has not been tested - return self.getReferences([doiID])[doiID] - - def getReferences (self, doiList): - # Purpose: returns a dictionary that maps each DOI ID to its - # corresponding PubMedReference object(s) (or None, if there - # is 
no reference data in PubMed for that DOI ID) - # Notes: DOI ID can map to multiple PubMed - - # translate doiList to doiID/pubmedID dictionary - # pubMedDict = {doiID:pubMedID, ...} - #print 'getReferences doiList: %s' % doiList - - pubMedDict = self.getPubMedIDs(doiList) - - # call getReferenceInfo - which is implemented by the subclass. - - mapping = {} - #print '### Getting PubMed References ###' - for doiID in pubMedDict: - if doiID not in mapping: - mapping[doiID] = [] - pubMedIdList = pubMedDict[doiID] - refObject = None # default, for no pmID - #print 'pubMedIdList: %s' % pubMedIdList - for pubMedID in pubMedIdList: - if pubMedID == None: - mapping[doiID].append(refObject) - else: - refObject = self.getReferenceInfo(pubMedID) - mapping[doiID].append(refObject) - return mapping + # DOI ID can map to multiple PubMed + # sc - this has not been tested + return self.getReferences([doiID])[doiID] + + def getReferences (self, doiList): + # Purpose: returns a dictionary that maps each DOI ID to its + # corresponding PubMedReference object(s) (or None, if there + # is no reference data in PubMed for that DOI ID) + # Notes: DOI ID can map to multiple PubMed + + # translate doiList to doiID/pubmedID dictionary + # pubMedDict = {doiID:pubMedID, ...} + #print 'getReferences doiList: %s' % doiList + + pubMedDict = self.getPubMedIDs(doiList) + + # call getReferenceInfo - which is implemented by the subclass. 
+ + mapping = {} + #print '### Getting PubMed References ###' + for doiID in pubMedDict: + if doiID not in mapping: + mapping[doiID] = [] + pubMedIdList = pubMedDict[doiID] + refObject = None # default, for no pmID + #print 'pubMedIdList: %s' % pubMedIdList + for pubMedID in pubMedIdList: + if pubMedID == None: + mapping[doiID].append(refObject) + else: + refObject = self.getReferenceInfo(pubMedID) + mapping[doiID].append(refObject) + return mapping class PubMedAgentJson (PubMedAgent): # Is: an agent that interacts with PubMed to get reference data @@ -264,8 +263,8 @@ class PubMedAgentJson (PubMedAgent): # for each reference # Note: Not implemented def __init__ (self): - # Purpose: constructor - return + # Purpose: constructor + return # override method used to format each reference, reporting JSON # for this class @@ -274,139 +273,139 @@ class PubMedAgentMedline (PubMedAgent): # Is: an agent that interacts with PubMed to get reference data # for DOI IDs # Does: takes DOI IDs, queries PubMed, and returns a Medline-formatted - # string for each reference + # str.for each reference def __init__ (self): - return + return # override method used to format each reference, reporting Medline # format for the PubMed request def getReferenceInfo(self, pubMedID): - # Purpose: Implementation of the superclass stub. 
Given a pubMedID, get a - # MedLine record, parse, create and return a PubMedReference object - # Throws: Exception if the URL returns an error - # Init the reference we will return - pubMedRef = None - try: - #print REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE) - response = urllib.urlopen(REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE)) - medLineRecord = string.strip(response.read()) - #print '"%s"' % medLineRecord - except IOError, e: - if hasattr(e, 'code'): # HTTPError - print 'http error code: ', e.code - raise Exception('HTTP error code: %s' % e.code) - elif hasattr(e, 'reason'): # URLError - print "Can't connect, reason: ", e.reason - raise Exception("Can't connect, reason: %s" % e.reason) - else: - raise Exception('Unknown exception: %s' % e) - - # if this pubMedID returns an error, create reference object with - # that error message, otherwise parse the record - if string.find(medLineRecord, 'Error occurred:') != -1: - pubMedRef = PubMedReference(errorMessage = medLineRecord) - else: - pubMedRef = PubMedReference() - tokens = string.split(medLineRecord, '\n') - - # Abstract, multilined w/o additional tag - isAB = 0 - abList = [] - - # author, multilined each with tag - auList = [] - - # title, multilined w/o additional tag - isTI = 0 - tiList = [] - - # publication type - isPT = 0 - - for line in tokens: - # parse MedLine format - - #print line - - if isTI == 1: - if line.startswith(' '): - tiList.append(string.strip(line)) - continue + # Purpose: Implementation of the superclass stub. 
Given a pubMedID, get a + # MedLine record, parse, create and return a PubMedReference object + # Throws: Exception if the URL returns an error + # Init the reference we will return + pubMedRef = None + try: + #print REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE) + response = urllib.request.urlopen(REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE)) + medLineRecord = str.strip(response.read()) + #print '"%s"' % medLineRecord + except IOError as e: + if hasattr(e, 'code'): # HTTPError + print('http error code: ', e.code) + raise Exception('HTTP error code: %s' % e.code) + elif hasattr(e, 'reason'): # URLError + print("Can't connect, reason: ", e.reason) + raise Exception("Can't connect, reason: %s" % e.reason) + else: + raise Exception('Unknown exception: %s' % e) + + # if this pubMedID returns an error, create reference object with + # that error message, otherwise parse the record + if str.find(medLineRecord, 'Error occurred:') != -1: + pubMedRef = PubMedReference(errorMessage = medLineRecord) + else: + pubMedRef = PubMedReference() + tokens = str.split(medLineRecord, '\n') + + # Abstract, multilined w/o additional tag + isAB = 0 + abList = [] + + # author, multilined each with tag + auList = [] + + # title, multilined w/o additional tag + isTI = 0 + tiList = [] + + # publication type + isPT = 0 + + for line in tokens: + # parse MedLine format + + #print line + + if isTI == 1: + if line.startswith(' '): + tiList.append(str.strip(line)) + continue else: - isTI = 0 + isTI = 0 - if isAB == 1: - if line.startswith(' '): - abList.append(string.strip(line)) - continue + if isAB == 1: + if line.startswith(' '): + abList.append(str.strip(line)) + continue else: - isAB = 0 + isAB = 0 - # strip by first '-' - try: - value = (map(string.strip,string.split(line, '-', 1)))[1] - # else use entire line + # strip by first '-' + try: + value = (list(map(str.strip,str.split(line, '-', 1))))[1] + # else use entire line except: - value = string.strip(line) - - # tags of interest - if 
line.startswith('PMID'): - pubMedRef.setPubMedID(value) - - elif line.startswith('TI'): - isTI = 1 - tiList.append(value) - - # skip 'AUID-' - elif line.startswith('AU -'): - if auList == []: - pubMedRef.setPrimaryAuthor(value) - auList.append(value) - - elif line.startswith('TA'): - pubMedRef.setJournal(value) - - elif line.startswith('DP'): - pubMedRef.setDate(value) - #print 'setting date in reference from: %s' % value - pubMedRef.setYear(string.split(value, ' ', 1)[0]) - - elif line.startswith('IP'): - pubMedRef.setIssue(value) - - elif line.startswith('PG'): - pubMedRef.setPages(value) - - elif line.startswith('AB'): - isAB = 1 - abList.append(value) - - elif line.startswith('VI'): - pubMedRef.setVolume(value) - - elif line.startswith('AID') and (string.find(line, '[doi]') > 0): - pubMedRef.setDoiID(string.strip(string.split(line, 'AID -')[1].split('[')[0])) - - elif line.startswith('PT'): - - # find last PT or use list - if isPT == 0: - if value == 'Review': - pubMedRef.setPublicationType(value) - isPT = 1 - elif value == 'Editorial': - pubMedRef.setPublicationType(value) - isPT = 1 - elif value == 'Comment': - pubMedRef.setPublicationType(value) - isPT = 1 - else: - pubMedRef.setPublicationType(value) - - pubMedRef.setAbstract(string.join(abList)) - pubMedRef.setAuthors(string.join(auList, '; ')) - pubMedRef.setTitle(string.join(tiList)) - - return pubMedRef + value = str.strip(line) + + # tags of interest + if line.startswith('PMID'): + pubMedRef.setPubMedID(value) + + elif line.startswith('TI'): + isTI = 1 + tiList.append(value) + + # skip 'AUID-' + elif line.startswith('AU -'): + if auList == []: + pubMedRef.setPrimaryAuthor(value) + auList.append(value) + + elif line.startswith('TA'): + pubMedRef.setJournal(value) + + elif line.startswith('DP'): + pubMedRef.setDate(value) + #print 'setting date in reference from: %s' % value + pubMedRef.setYear(str.split(value, ' ', 1)[0]) + + elif line.startswith('IP'): + pubMedRef.setIssue(value) + + elif 
line.startswith('PG'): + pubMedRef.setPages(value) + + elif line.startswith('AB'): + isAB = 1 + abList.append(value) + + elif line.startswith('VI'): + pubMedRef.setVolume(value) + + elif line.startswith('AID') and (str.find(line, '[doi]') > 0): + pubMedRef.setDoiID(str.strip(str.split(line, 'AID -')[1].split('[')[0])) + + elif line.startswith('PT'): + + # find last PT or use list + if isPT == 0: + if value == 'Review': + pubMedRef.setPublicationType(value) + isPT = 1 + elif value == 'Editorial': + pubMedRef.setPublicationType(value) + isPT = 1 + elif value == 'Comment': + pubMedRef.setPublicationType(value) + isPT = 1 + else: + pubMedRef.setPublicationType(value) + + pubMedRef.setAbstract(str.join(abList)) + pubMedRef.setAuthors(str.join(auList, '; ')) + pubMedRef.setTitle(str.join(tiList)) + + return pubMedRef diff --git a/PubMedCentralAgent.py b/PubMedCentralAgent.py index 6ec771c..41b3cf6 100755 --- a/PubMedCentralAgent.py +++ b/PubMedCentralAgent.py @@ -1,4 +1,3 @@ -# Name: PubMedCentralAgent.py # Purpose: provide an interface to various services at PubMed Central # Usage: # 1. Initialize the module by calling setToolName() and/or setEmailAddress() as desired to override @@ -7,7 +6,7 @@ # PMC IDs and look up) # 3. Run with it. 
-import urllib2
+import urllib.request, urllib.error, urllib.parse
 import xml.dom.minidom
 import HttpRequestGovernor
 
@@ -68,7 +67,7 @@ def __init__ (self):
 
     def getPMCID (self, doiID):
         # Purpose: look up the PMC ID for a single DOI ID
-        # Returns: string (PMC ID) or None (if the DOI ID has no PMC ID)
+        # Returns: string (PMC ID) or None (if the DOI ID has no PMC ID)
         # Throws: Exception if there are problems communicating with PubMed Central
 
         return self.getPMCIDs([ doiID ])[doiID]
@@ -84,14 +83,14 @@ def getPMCIDs (self, doiIDs):
             return pmcIDs
 
         # strip leading & trailing spaces from IDs and split the list into chunks
-        sublists = _splitList(map(lambda x : x.strip(), doiIDs), 20)
+        sublists = _splitList([x.strip() for x in doiIDs], 20)
 
         for sublist in sublists:
             lines = HttpRequestGovernor.readURL(ID_CONVERTER_URL % (TOOL_NAME, EMAIL_ADDRESS, ','.join(sublist)))
 
             # Lines have comma-delimited columns. String values are in double-quotes.
             # Standardize lines by stripping out the double-quotes, then splitting on commas.
-            lines = map(lambda x: x.split(','), lines.replace('"', '').split('\n'))
+            lines = [x.split(',') for x in lines.replace('"', '').split('\n')]
 
             # first line will have column headers. We need DOI and PMCID columns.
if 'DOI' not in lines[0]: @@ -119,7 +118,7 @@ def __init__ (self): def getUrl (self, pmcID): # Purpose: look up the download URL for a single PMC ID - # Returns: string (URL) or None (if the PMC ID has no file to download) + # Returns: str.(URL) or None (if the PMC ID has no file to download) # Throws: Exception if there are problems communicating with PubMed Central return self.getUrls([ pmcID ])[pmcID] @@ -136,7 +135,7 @@ def getUrls (self, pmcIDs): if not pmcIDs: return urls - for pmcID in map(lambda x: x.strip(), pmcIDs): + for pmcID in [x.strip() for x in pmcIDs]: lines = HttpRequestGovernor.readURL(PDF_LOOKUP_URL % pmcID) xmldoc = xml.dom.minidom.parseString(lines) @@ -154,4 +153,4 @@ def getUrls (self, pmcIDs): else: urls[pmcID] = None - return urls \ No newline at end of file + return urls diff --git a/extractedTextSplitter.py b/extractedTextSplitter.py index 1f80482..6dca9ef 100755 --- a/extractedTextSplitter.py +++ b/extractedTextSplitter.py @@ -1,4 +1,3 @@ -#!/usr/local/bin/python #!/usr/bin/env python2.4 #!/usr/bin/env python @@ -20,13 +19,13 @@ body - everything up to the next section references - the reference section manuscript figures - some manuscript PDFs have figures/tables after - the refs - (WE USE THE TERM "figure" to mean "figure/table" ) + the refs + (WE USE THE TERM "figure" to mean "figure/table" ) star*methods - some papers have an extended "methods" section - after the refs. This section is called "Star*Methods" + after the refs. This section is called "Star*Methods" supplemental data - indicated by a special MGI text tag inserted by - Nancy or someone in MGI when the supp data is added - to the PDF + Nancy or someone in MGI when the supp data is added + to the PDF The end of each section is the beginning of the next section. @@ -46,17 +45,17 @@ splitter = ExtTextSplitter() - # To just get the actual sections' string: + # To just get the actual sections' str. 
(body, refs, manuFigures, starMethods, suppData) = \ - splitter.splitSections(text) + splitter.splitSections(text) # To get Section objects that contain more info about each predicted section # (Section Objects contain: type, text, sPos, ePos, and a "reason" the # splitting algorithm made its prediction - see Section Class below): (bodyS, refsS, manuFiguresS, starMethodsS, suppDataS) = \ - splitter.findSections(text) + splitter.findSections(text) ################### Overview of the Splitting Algorithm: ################### @@ -67,8 +66,8 @@ If Refs start is found, search for Manuscript figures: search foward from there for a figure legend occurring before the start of Star Methods NOTE this will match any figure/table legend after the start of the refs, - even if they are not in an official "manuscript" section. - But ending the refs section at any legend is good + even if they are not in an official "manuscript" section. + But ending the refs section at any legend is good Body = start of text up to References start There are also minFraction (for all sections) and maxFraction (for references @@ -82,10 +81,10 @@ which may == end of extracted text. Algorithm approach: -You cannot search backward in a string with a regex. +You cannot search backward in a str.with a regex. Instead, we put all the different section heading patterns into one honking regex that we match (forward) against the extText. -(so we only make one pass through the string) +(so we only make one pass through the str. Then we can scan the list of all regex matches backward (or forward) as neededj. See class TypedRegexMatcher @@ -96,9 +95,9 @@ class Section - describes a section class ExtTextSplitter - implements the splitting algorithm class TypedRegexMatcher - takes regex's grouped by user defined "type", - combines them, lets you match against a string, - and gives you back lists of TypedMatch objects - that represent the matches found + combines them, lets you match against a str. 
+ and gives you back lists of TypedMatch objects + that represent the matches found class TypedMatch - describes a match from TypedRegexMatcher """ import string @@ -108,12 +107,12 @@ class TypedMatch - describes a match from TypedRegexMatcher # Regex building functions # ---------------------------------- def spacedOutRegex(s): - # for given string, return regex pattern string that matches the chars - # in the string with optional spaces between the chars. + # for given str. return regex pattern str.that matches the chars + # in the str.with optional spaces between the chars. # Useful because sometimes the PDF to text extraction inserts spaces. reg = [] for c in s: - reg.append('[%s]' % c) + reg.append('[%s]' % c) return '[ ]*'.join(reg) #----------------------------------------------- @@ -133,42 +132,42 @@ class Section (object): IS an object that describes a section of an article (from extracted text) """ def __init__(self, secType, text='', reason='', sPos=None, ePos=None): - self.secType = secType # section name. see vocab above - self.text = text # the text of the section - self.reason = reason # reason this section start was chosen - # typically the string that we matched - # for section header. - self.sPos = sPos # start position within the article text - self.ePos = ePos # end pos - index of 1st char not in section + self.secType = secType # section name. see vocab above + self.text = text # the text of the section + self.reason = reason # reason this section start was chosen + # typically the str.that we matched + # for section header. 
+ self.sPos = sPos # start position within the article text + self.ePos = ePos # end pos - index of 1st char not in section def __str__(self): - return "Section object: %s reason: '%s' %d %d\n'%s'\n" % \ - (self.secType, self.reason, self.sPos, self.ePos, self.text[:40]) + return "Section object: %s reason: '%s' %d %d\n'%s'\n" % \ + (self.secType, self.reason, self.sPos, self.ePos, self.text[:40]) #------------------ end Class SectionBoundary class ExtTextSplitter (object): #{ ''' Is: a class that knows how to split extracted text for a PDF - into multiple parts: - body, references, manuscript figs, "star methods", supp data - (any of these sections except body can be the empty string) - "Star Methods" is a methods section that some journals put after - the references and before any supplemental data. + into multiple parts: + body, references, manuscript figs, "star methods", supp data + (any of these sections except body can be the empty str. + "Star Methods" is a methods section that some journals put after + the references and before any supplemental data. Has: floats: minFraction, maxFraction and a TypedRegexMatcher - The min/maxFractions are used ... JIM: doc this better - once any supp data and "star methods" section are removed from the end - of the text. - If the length of the predicted reference section is - > maxFraction of the total text length from the end - or - < minFraction from the end - then the prediction is considered invalid, and the reference section - is set to '', and the body is not split. - - Does: splitSections() - get the section strings - findSections() - get descriptions of the section + The min/maxFractions are used ... JIM: doc this better + once any supp data and "star methods" section are removed from the end + of the text. 
+ If the length of the predicted reference section is + > maxFraction of the total text length from the end + or + < minFraction from the end + then the prediction is considered invalid, and the reference section + is set to '', and the body is not split. + + Does: splitSections() - get the section str. + findSections() - get descriptions of the section ''' # Names of the match/regex types @@ -182,46 +181,46 @@ class ExtTextSplitter (object): #{ # Different journals/articles may have words before "Figure": # any single word, "\w+", or the specific word combos below OPT_FIG_START = r'(?:(?:\w+' + \ - '|' + spacedOutRegex('supp data') + \ - '|' + spacedOutRegex('supplemental data') + \ - '|' + spacedOutRegex('supplementary data') + \ - '|' + spacedOutRegex('extended data') + \ - r') )?' + '|' + spacedOutRegex('supp data') + \ + '|' + spacedOutRegex('supplemental data') + \ + '|' + spacedOutRegex('supplementary data') + \ + '|' + spacedOutRegex('extended data') + \ + r') )?' # Dict defining all the section start tags and their match types # End each w/ \n or \b to force line or word boundaries # The startPattern on TypedRegexMatcher constructor sets '\n' for line start regexDict = { - REF_SECTION_PRIMARY : [spacedOutRegex("References") + '\n', - spacedOutRegex("Literature Cited") + '\n', - spacedOutRegex("References and Notes") + '\n', - ], - REF_SECTION_SECONDARY: [spacedOutRegex("Reference") + '\n', - spacedOutRegex("Acknowledgements") + r'\b', - spacedOutRegex("Acknowledgments") + r'\b', - spacedOutRegex("Conflicts of Interest") + r'\b', - spacedOutRegex("Conflict of Interest") + r'\b', - ], - MANUSCRIPT_FIGURES : [OPT_FIG_START + spacedOutRegex("Figure")+ r'\b', - OPT_FIG_START + spacedOutRegex("Fig") + r'\b', - OPT_FIG_START + spacedOutRegex("Table") + r'\b', - ], - STAR_METHODS : [spacedOutRegex("Star") + "[ ]*[ *+][ ]*" + - spacedOutRegex("Methods") + '\n', - ], - SUPP_DATA : [spacedOutRegex(SUPP_DATA_TAG) + '\n', - ], - } + REF_SECTION_PRIMARY : 
[spacedOutRegex("References") + '\n', + spacedOutRegex("Literature Cited") + '\n', + spacedOutRegex("References and Notes") + '\n', + ], + REF_SECTION_SECONDARY: [spacedOutRegex("Reference") + '\n', + spacedOutRegex("Acknowledgements") + r'\b', + spacedOutRegex("Acknowledgments") + r'\b', + spacedOutRegex("Conflicts of Interest") + r'\b', + spacedOutRegex("Conflict of Interest") + r'\b', + ], + MANUSCRIPT_FIGURES : [OPT_FIG_START + spacedOutRegex("Figure")+ r'\b', + OPT_FIG_START + spacedOutRegex("Fig") + r'\b', + OPT_FIG_START + spacedOutRegex("Table") + r'\b', + ], + STAR_METHODS : [spacedOutRegex("Star") + "[ ]*[ *+][ ]*" + + spacedOutRegex("Methods") + '\n', + ], + SUPP_DATA : [spacedOutRegex(SUPP_DATA_TAG) + '\n', + ], + } def __init__(self, - minFraction=0.05, # min fraction predicted for ref section - maxFraction=0.4, # max fraction of whole doc that the - # predicted ref section is allowed to be - ): - self.minFraction = minFraction - self.maxFraction = maxFraction - self.matcher = TypedRegexMatcher(self.regexDict, startPattern='\n') - self.initSections('') + minFraction=0.05, # min fraction predicted for ref section + maxFraction=0.4, # max fraction of whole doc that the + # predicted ref section is allowed to be + ): + self.minFraction = minFraction + self.maxFraction = maxFraction + self.matcher = TypedRegexMatcher(self.regexDict, startPattern='\n') + self.initSections('') # ---------------------------------- def getRegexMatcher(self): return self.matcher @@ -229,257 +228,257 @@ def getExtText(self): return self.extText # ---------------------------------- def initSections(self, extText): - """ - initialize all the text sections to missing - """ - self.extText = extText - self.lenExtText = len(extText) - - # body is whole thing for now. 
- self.bodyS = Section(SECTION_BODY, extText, "body start", 0, - self.lenExtText) - - # mark all other sections as missing for now - self.refsS = Section(SECTION_REFS, '', 'no ref section match', - self.lenExtText, self.lenExtText) - self.mfigS = Section(SECTION_MFIGS, '', 'no manuscript figs match', - self.lenExtText, self.lenExtText) - self.starS = Section(SECTION_STAR, '', 'no star methods match', - self.lenExtText, self.lenExtText) - self.suppS = Section(SECTION_SUPP, '', 'no supp data match', - self.lenExtText, self.lenExtText) + """ + initialize all the text sections to missing + """ + self.extText = extText + self.lenExtText = len(extText) + + # body is whole thing for now. + self.bodyS = Section(SECTION_BODY, extText, "body start", 0, + self.lenExtText) + + # mark all other sections as missing for now + self.refsS = Section(SECTION_REFS, '', 'no ref section match', + self.lenExtText, self.lenExtText) + self.mfigS = Section(SECTION_MFIGS, '', 'no manuscript figs match', + self.lenExtText, self.lenExtText) + self.starS = Section(SECTION_STAR, '', 'no star methods match', + self.lenExtText, self.lenExtText) + self.suppS = Section(SECTION_SUPP, '', 'no supp data match', + self.lenExtText, self.lenExtText) # ---------------------------------- def splitSections(self, extText): - """ - #### if you just want the text of the sections, call this #### - Split the exText, return the sections text - Return the text of the sections tuple: - (body, ref section, manuscript figs, star methods, supp data) - """ - self.findSections(extText) - return (self.bodyS.text, - self.refsS.text, - self.mfigS.text, - self.starS.text, - self.suppS.text, - ) + """ + #### if you just want the text of the sections, call this #### + Split the exText, return the sections text + Return the text of the sections tuple: + (body, ref section, manuscript figs, star methods, supp data) + """ + self.findSections(extText) + return (self.bodyS.text, + self.refsS.text, + self.mfigS.text, + 
self.starS.text, + self.suppS.text, + ) # ---------------------------------- def findSections(self, extText): - """ - #### if you want details of the sections, call this #### - Find the sections in text. - Set self.bodyS, refsS, mfigS, starS, suppS - to Section objects describing each section - Return the 5 Section objects: - """ - self.initSections(extText) - - matches = self.matcher.match(extText) - if len(matches) != 0: # got some matches - # The order of these calls is important - self.findSuppSection() - self.findStarSection() - self.findRefsSection() - self.findMfigSection() - self.findBodySection() - return self.bodyS, self.refsS, self.mfigS, self.starS, self.suppS + """ + #### if you want details of the sections, call this #### + Find the sections in text. + Set self.bodyS, refsS, mfigS, starS, suppS + to Section objects describing each section + Return the 5 Section objects: + """ + self.initSections(extText) + + matches = self.matcher.match(extText) + if len(matches) != 0: # got some matches + # The order of these calls is important + self.findSuppSection() + self.findStarSection() + self.findRefsSection() + self.findMfigSection() + self.findBodySection() + return self.bodyS, self.refsS, self.mfigS, self.starS, self.suppS # ---------------------------------- def findSuppSection(self): - """ - Set self.suppS - Assumes: - self.suppS is initialized to be length 0 at end of self.extText - """ - section = self.suppS - matches = self.matcher.getMatches(self.SUPP_DATA) - if len(matches) != 0: # matched supp data start tags - m = matches[-1] # use last match - - section.reason = m.text - section.sPos = m.sPos - section.ePos = self.lenExtText - section.text = self.extText[section.sPos : section.ePos] - - # else assume self.suppS is already initialized correctly - return + """ + Set self.suppS + Assumes: + self.suppS is initialized to be length 0 at end of self.extText + """ + section = self.suppS + matches = self.matcher.getMatches(self.SUPP_DATA) + if 
len(matches) != 0: # matched supp data start tags + m = matches[-1] # use last match + + section.reason = m.text + section.sPos = m.sPos + section.ePos = self.lenExtText + section.text = self.extText[section.sPos : section.ePos] + + # else assume self.suppS is already initialized correctly + return # ---------------------------------- def findStarSection(self): - """ - Set self.starS - Assumes: - self.suppS is set appropriately - self.starS is initialized to be length 0 at end of self.extText - """ - section = self.starS - allMatches = self.matcher.getMatches(self.STAR_METHODS) - - matches = [] - for m in allMatches: # collect matches before supp data start - if m.sPos < self.suppS.sPos: matches.append(m) - else: break - - if len(matches) == 0: # no star methods match - section.sPos = self.suppS.sPos - section.ePos = self.suppS.sPos - else: # got a match - matches.reverse() - m = self.findNotTooLateMatch(matches) - if m == None: # no reasonable match - section.reason = 'star methods too close to end (%d)' % \ - matches[-1].sPos - section.sPos = self.suppS.sPos - section.ePos = self.suppS.sPos - else: # got a good one - section.reason = m.text - section.sPos = m.sPos - section.ePos = self.suppS.sPos - - section.text = self.extText[section.sPos : section.ePos] - return + """ + Set self.starS + Assumes: + self.suppS is set appropriately + self.starS is initialized to be length 0 at end of self.extText + """ + section = self.starS + allMatches = self.matcher.getMatches(self.STAR_METHODS) + + matches = [] + for m in allMatches: # collect matches before supp data start + if m.sPos < self.suppS.sPos: matches.append(m) + else: break + + if len(matches) == 0: # no star methods match + section.sPos = self.suppS.sPos + section.ePos = self.suppS.sPos + else: # got a match + matches.reverse() + m = self.findNotTooLateMatch(matches) + if m == None: # no reasonable match + section.reason = 'star methods too close to end (%d)' % \ + matches[-1].sPos + section.sPos = 
self.suppS.sPos + section.ePos = self.suppS.sPos + else: # got a good one + section.reason = m.text + section.sPos = m.sPos + section.ePos = self.suppS.sPos + + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findRefsSection(self): - """ - Set self.refsS - Assumes: - self.starS is set appropriately - self.refsS is initialized to be length 0 at end of self.extText - """ - section = self.refsS - primary = self.matcher.getMatches(self.REF_SECTION_PRIMARY) - secondary = self.matcher.getMatches(self.REF_SECTION_SECONDARY) - - m, primaryReason = self.findRefsMatch(primary) - - if m: # got a good primary match - section.reason = primaryReason - section.sPos = m.sPos - section.ePos = self.starS.sPos - - elif len(secondary) == 0: # no good primary, and no secondary - section.reason = primaryReason - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - else: # no good primary, but some secondary - m, secondaryReason = self.findRefsMatch(secondary) - - if m: # got good secondary match - section.reason = secondaryReason - section.sPos = m.sPos - section.ePos = self.starS.sPos - else: # no good secondary match either - section.reason = primaryReason + '; \n' + secondaryReason - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - - section.text = self.extText[section.sPos : section.ePos] - return + """ + Set self.refsS + Assumes: + self.starS is set appropriately + self.refsS is initialized to be length 0 at end of self.extText + """ + section = self.refsS + primary = self.matcher.getMatches(self.REF_SECTION_PRIMARY) + secondary = self.matcher.getMatches(self.REF_SECTION_SECONDARY) + + m, primaryReason = self.findRefsMatch(primary) + + if m: # got a good primary match + section.reason = primaryReason + section.sPos = m.sPos + section.ePos = self.starS.sPos + + elif len(secondary) == 0: # no good primary, and no secondary + section.reason = primaryReason + section.sPos = self.starS.sPos + 
section.ePos = self.starS.sPos + else: # no good primary, but some secondary + m, secondaryReason = self.findRefsMatch(secondary) + + if m: # got good secondary match + section.reason = secondaryReason + section.sPos = m.sPos + section.ePos = self.starS.sPos + else: # no good secondary match either + section.reason = primaryReason + '; \n' + secondaryReason + section.sPos = self.starS.sPos + section.ePos = self.starS.sPos + + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findRefsMatch(self, allMatches): - """ - Find a good match in 'allMatches'. - Return the good match object (or None) + reason - Assumes: matches is sorted from start of doc to end - """ - matches = [] - for m in allMatches: # collect matches before star methods start - if m.sPos < self.starS.sPos: matches.append(m) - else: break - - if len(matches) != 0: # matched refs start tags - matches.reverse() # find last occurances first - m = self.findNotTooLateMatch(matches) - if m != None: # match that is not too late - - # is length of refs too big? - refLength = self.starS.sPos - m.sPos - if float(refLength)/float(self.starS.sPos) <= self.maxFraction: - reason = m.text - else: # refs section too big - reason = "refs match is too early: '%s' (%d)" % \ - (m.text, m.sPos) - m = None - else: # no good match - reason ="refs matches too close to end, earliest: '%s' (%d)" % \ - (matches[-1].text, matches[-1].sPos) - m = None - else: # no start tag found - m = None - reason = 'no refs match' - - return m, reason + """ + Find a good match in 'allMatches'. 
+ Return the good match object (or None) + reason + Assumes: matches is sorted from start of doc to end + """ + matches = [] + for m in allMatches: # collect matches before star methods start + if m.sPos < self.starS.sPos: matches.append(m) + else: break + + if len(matches) != 0: # matched refs start tags + matches.reverse() # find last occurances first + m = self.findNotTooLateMatch(matches) + if m != None: # match that is not too late + + # is length of refs too big? + refLength = self.starS.sPos - m.sPos + if float(refLength)/float(self.starS.sPos) <= self.maxFraction: + reason = m.text + else: # refs section too big + reason = "refs match is too early: '%s' (%d)" % \ + (m.text, m.sPos) + m = None + else: # no good match + reason ="refs matches too close to end, earliest: '%s' (%d)" % \ + (matches[-1].text, matches[-1].sPos) + m = None + else: # no start tag found + m = None + reason = 'no refs match' + + return m, reason # ---------------------------------- def findMfigSection(self): - """ - Look forward from refs section to star methods & see if any - figures/tables in between. - If so, truncate refs section at first figure/table start. - set the mfigS to be the fig start to star methods start. 
- Assume: - self.starS is set appropriately - self.mfigS is initialized to be length 0 at end of self.extText - """ - section = self.mfigS - matches = self.matcher.getMatches(self.MANUSCRIPT_FIGURES) - - figMatch = None # the match of the 1st fig after refs start - for m in matches: - if self.refsS.sPos < m.sPos and m.sPos < self.starS.sPos: - figMatch = m - break - if figMatch: # got a fig after refs & before any starS - section.reason = figMatch.text - section.sPos = figMatch.sPos - section.ePos = self.starS.sPos - section.text = self.extText[section.sPos : section.ePos] - - self.refsS.ePos = figMatch.sPos # adjust end of refs section - self.refsS.text = self.extText[self.refsS.sPos: self.refsS.ePos] - else: # no fig match - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - # section.text & reason should be set ok from initSections - return + """ + Look forward from refs section to star methods & see if any + figures/tables in between. + If so, truncate refs section at first figure/table start. + set the mfigS to be the fig start to star methods start. 
+ Assume: + self.starS is set appropriately + self.mfigS is initialized to be length 0 at end of self.extText + """ + section = self.mfigS + matches = self.matcher.getMatches(self.MANUSCRIPT_FIGURES) + + figMatch = None # the match of the 1st fig after refs start + for m in matches: + if self.refsS.sPos < m.sPos and m.sPos < self.starS.sPos: + figMatch = m + break + if figMatch: # got a fig after refs & before any starS + section.reason = figMatch.text + section.sPos = figMatch.sPos + section.ePos = self.starS.sPos + section.text = self.extText[section.sPos : section.ePos] + + self.refsS.ePos = figMatch.sPos # adjust end of refs section + self.refsS.text = self.extText[self.refsS.sPos: self.refsS.ePos] + else: # no fig match + section.sPos = self.starS.sPos + section.ePos = self.starS.sPos + # section.text & reason should be set ok from initSections + return # ---------------------------------- def findBodySection(self): - section = self.bodyS - section.sPos = 0 - section.ePos = self.refsS.sPos - section.text = self.extText[section.sPos : section.ePos] - return + section = self.bodyS + section.sPos = 0 + section.ePos = self.refsS.sPos + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findNotTooLateMatch(self, matches, totalTextLength='default'): - """ - Given a list of matches, return the 1st one that is not too close - to the end. - (at least self.minFactor from textEnd JIM: doc better) - Return None if we don't find one - """ - if totalTextLength == 'default': totalTextLength = self.lenExtText - retVal = None - for m in matches: - if not self.isTooCloseToEnd(m.sPos, totalTextLength): - retVal = m - break - return retVal + """ + Given a list of matches, return the 1st one that is not too close + to the end. 
+ (at least self.minFactor from textEnd JIM: doc better) + Return None if we don't find one + """ + if totalTextLength == 'default': totalTextLength = self.lenExtText + retVal = None + for m in matches: + if not self.isTooCloseToEnd(m.sPos, totalTextLength): + retVal = m + break + return retVal # ---------------------------------- def isTooCloseToEnd(self, sPos, totalTextLength='default'): - """ + """ Return Boolean: Is the predicted (section) start position too - close to the end to be reasonable? + close to the end to be reasonable? - (if too close, it is likely some text in the PDF page footer) + (if too close, it is likely some text in the PDF page footer) """ - if totalTextLength == 'default': totalTextLength = self.lenExtText + if totalTextLength == 'default': totalTextLength = self.lenExtText sectionLen = totalTextLength - sPos sectionLengthFraction = float(sectionLen)/totalTextLength return sectionLengthFraction < self.minFraction @@ -493,114 +492,114 @@ class TypedMatch (object): Represents a match from a TypedRegexMatcher. 
""" def __init__(self, matchType, text, sPos, ePos): - self.matchType = matchType # types from the regexDict passed - # to TypedRegexMatcher - self.text = text # the string that matched the regex - self.sPos = sPos # start pos in the text of matching str - self.ePos = ePos # end pos in the text of matching str - # i.e., 1st index in text after match + self.matchType = matchType # types from the regexDict passed + # to TypedRegexMatcher + self.text = text # the str.that matched the regex + self.sPos = sPos # start pos in the text of matching str + self.ePos = ePos # end pos in the text of matching str + # i.e., 1st index in text after match def __str__(self): - return "TypedMatched object: %s '%s' %d %d" % \ - (self.matchType, self.text, self.sPos, self.ePos) + return "TypedMatched object: %s '%s' %d %d" % \ + (self.matchType, self.text, self.sPos, self.ePos) #----------------------------------------------- class TypedRegexMatcher (object): #{ """ - Is: A class that matches a set of regex's against strings. - Each regex has a user defined type/category name. - Once you have matched against a string, you can get back the matches - by type, in the order the matches occur in the string. + Is: A class that matches a set of regex's against str.. + Each regex has a user defined type/category name. + Once you have matched against a str. you can get back the matches + by type, in the order the matches occur in the str. - This is built to make one regex match pass over a string once, and - yet pull out all the individual matches, by type. + This is built to make one regex match pass over a str.once, and + yet pull out all the individual matches, by type. Has: Dict of typed regex's: - {'type 1' : [regex pattern strings...], - 'type 2' : [regex pattern strings...], - ... - } - A honking regex built from this dict + {'type 1' : [regex pattern str....], + 'type 2' : [regex pattern str....], + ... 
+    }
+    A honking regex built from this dict

    Does: match('sometext')
-    After a match:
-    getMatches('type'), getAllMatches()
-    return lists of TypedMatch objects in the order they appear
-    in 'sometext'
+    After a match:
+    getMatches('type'), getAllMatches()
+    return lists of TypedMatch objects in the order they appear
+    in 'sometext'
    """
    # -----------------------

    def __init__(self,
-        regexDict, # Dict of regex's as above
-        startPattern='', # Regex pattern str to match at the
-        # start of all regex's.
-        # Use this to force matches at start
-        # of paragraphs.
-        flags=re.IGNORECASE, # Regex flags when matching, see re.
-        # flags=0 to get re module defaults
-        ):
-        self.regexDict = regexDict
-        self.regexTypes = self.regexDict.keys()
-        self.startPattern = startPattern
-        self.flags = flags
-
-        self.buildRegexStr()
-        self.regex = re.compile(self.regexStr, self.flags)
-
-        self.initMatchResults()
+        regexDict, # Dict of regex's as above
+        startPattern='', # Regex pattern str to match at the
+        # start of all regex's.
+        # Use this to force matches at start
+        # of paragraphs.
+        flags=re.IGNORECASE, # Regex flags when matching, see re.
+        # flags=0 to get re module defaults
+        ):
+        self.regexDict = regexDict
+        self.regexTypes = list(self.regexDict.keys())
+        self.startPattern = startPattern
+        self.flags = flags
+
+        self.buildRegexStr()
+        self.regex = re.compile(self.regexStr, self.flags)
+
+        self.initMatchResults()

    # ----------------------------------

    def buildRegexStr(self):
-        """
-        Set self.refRegex to the honking regex...
-        Each regex type is its own named regex group.
-        """
-        regexParts = []
-        for tType,regList in self.regexDict.items():
-        rs = r'(?P<%s>%s)' % ( tType, '|'.join(regList) )
-        regexParts.append(rs)
-
-        self.regexStr = self.startPattern + '(?:' + '|'.join(regexParts) + ')'
+        """
+        Set self.regexStr to the honking regex...
+        Each regex type is its own named regex group.
+ """ + regexParts = [] + for tType,regList in list(self.regexDict.items()): + rs = r'(?P<%s>%s)' % ( tType, '|'.join(regList) ) + regexParts.append(rs) + + self.regexStr = self.startPattern + '(?:' + '|'.join(regexParts) + ')' # ---------------------------------- def initMatchResults(self): - """ - Initialize matchesByType and allMatches to empty lists - """ - self.matchesByType = {} - for t in self.regexTypes: - self.matchesByType[t] = [] - self.allMatches = [] + """ + Initialize matchesByType and allMatches to empty lists + """ + self.matchesByType = {} + for t in self.regexTypes: + self.matchesByType[t] = [] + self.allMatches = [] # ---------------------------------- def match(self, text): - """ - Match the regex's against the text. - Return the list of all matches (TypeMatch objects) - """ - self.initMatchResults() + """ + Match the regex's against the text. + Return the list of all matches (TypeMatch objects) + """ + self.initMatchResults() - for reM in self.regex.finditer(text): # for the regex Match objects + for reM in self.regex.finditer(text): # for the regex Match objects - # for the named groups: - # Note all named groups are in the groupdict, - # even if there is no match to that group - for mType, mText in reM.groupdict().items(): - if mText != None: break # when we find group w/ a value, - # that is the matching group + # for the named groups: + # Note all named groups are in the groupdict, + # even if there is no match to that group + for mType, mText in list(reM.groupdict().items()): + if mText != None: break # when we find group w/ a value, + # that is the matching group - sPos, ePos = reM.span(mType) - m = TypedMatch(mType, mText, sPos, ePos) # our own match object + sPos, ePos = reM.span(mType) + m = TypedMatch(mType, mText, sPos, ePos) # our own match object - self.allMatches.append(m) - self.matchesByType[mType].append(m) + self.allMatches.append(m) + self.matchesByType[mType].append(m) - return self.allMatches + return self.allMatches # 
---------------------------------- def getMatches(self, regexType): - if regexType not in self.regexTypes: - raise KeyError("invalid match type '%s'" % regexType) - return self.matchesByType[regexType] + if regexType not in self.regexTypes: + raise KeyError("invalid match type '%s'" % regexType) + return self.matchesByType[regexType] def getAllMatches(self): return self.allMatches def getRegexStr(self): return self.regexStr @@ -613,68 +612,68 @@ def getRegexTypes(self): return self.regexTypes # ----------------------- if __name__ == "__main__": # some ad hoc tests - print "Running ad hoc tests - modify these as needed" + print("Running ad hoc tests - modify these as needed") if False: # TypedRegexMatcher tests - PARA_BOUNDARY = '\n\n' - regexDict = { - 'animal' : [spacedOutRegex('duck'), - r'\bdog\b'], - 'tree' : ['[oO]ak|fir', 'apple', 'beech',], - } - #matcher = TypedRegexMatcher(regexDict, startPattern=PARA_BOUNDARY) - matcher = TypedRegexMatcher(regexDict, ) - - print matcher.getRegexStr() - - s = 'the \n\ndu ck\n and DoG climbed an Oak tree' - #s = 'no matches here tree' - - print len(matcher.match(s)) - for m in matcher.getAllMatches(): - print str(m) - print - - for t in regexDict.keys(): - print "%s:\n[" % t - for m in matcher.getMatches(t): - print str(m) - print '] ------' - #x = matcher.getMatches('foo') # test using invalid regex type + PARA_BOUNDARY = '\n\n' + regexDict = { + 'animal' : [spacedOutRegex('duck'), + r'\bdog\b'], + 'tree' : ['[oO]ak|fir', 'apple', 'beech',], + } + #matcher = TypedRegexMatcher(regexDict, startPattern=PARA_BOUNDARY) + matcher = TypedRegexMatcher(regexDict, ) + + print(matcher.getRegexStr()) + + s = 'the \n\ndu ck\n and DoG climbed an Oak tree' + #s = 'no matches here tree' + + print(len(matcher.match(s))) + for m in matcher.getAllMatches(): + print(str(m)) + print() + + for t in list(regexDict.keys()): + print("%s:\n[" % t) + for m in matcher.getMatches(t): + print(str(m)) + print('] ------') + #x = matcher.getMatches('foo') 
# test using invalid regex type if True: # ExtTextSplitter tests - def runSectionTest(sp, doc): - print "--------- doc length: %d" % len(doc) - (bodyS, refsS, mfigS, starS, suppS) = sp.findSections(doc) - print str(bodyS) - print str(refsS) - print str(mfigS) - print str(starS) - print str(suppS) - # ------- + def runSectionTest(sp, doc): + print("--------- doc length: %d" % len(doc)) + (bodyS, refsS, mfigS, starS, suppS) = sp.findSections(doc) + print(str(bodyS)) + print(str(refsS)) + print(str(mfigS)) + print(str(starS)) + print(str(suppS)) + # ------- # PARA_BOUNDARY + 'References' + \ # "\n1234567890" + \ # PARA_BOUNDARY + 'star*methods' + \ # PARA_BOUNDARY + 'star*methods' + \ - doc = "1234567890" + \ - '\nfigure 1: here is a legend' + \ - '\n' + 'references' + \ - "\n1234567890" + \ - '\n' + 'conf licts of int erest' + \ - "\n1234567890" + \ - '\nsupplementary data TABLE 2: here is a legend' + \ - "\n1234567890" + \ - '\nfigure 3: here is a legend' + \ - "\n1234567890" + \ - '\n' + 'star*methods' + \ - "\n1234567890" + \ - '\n' + SUPP_DATA_TAG + \ - "\n1234567890" \ - '\n' + 'star*methods' + \ - "\n1234567890" - #doc = open('6114980.txt', 'r').read() - #doc = "1234567890" + PARA_BOUNDARY + 'foo' + "\n1234567890" - sp = ExtTextSplitter(maxFraction=0.9, minFraction=.1) - #print sp.getRegexMatcher().getRegexStr() - runSectionTest(sp, doc) + doc = "1234567890" + \ + '\nfigure 1: here is a legend' + \ + '\n' + 'references' + \ + "\n1234567890" + \ + '\n' + 'conf licts of int erest' + \ + "\n1234567890" + \ + '\nsupplementary data TABLE 2: here is a legend' + \ + "\n1234567890" + \ + '\nfigure 3: here is a legend' + \ + "\n1234567890" + \ + '\n' + 'star*methods' + \ + "\n1234567890" + \ + '\n' + SUPP_DATA_TAG + \ + "\n1234567890" \ + '\n' + 'star*methods' + \ + "\n1234567890" + #doc = open('6114980.txt', 'r').read() + #doc = "1234567890" + PARA_BOUNDARY + 'foo' + "\n1234567890" + sp = ExtTextSplitter(maxFraction=0.9, minFraction=.1) + #print 
sp.getRegexMatcher().getRegexStr() + runSectionTest(sp, doc)