From 843dacf79f2e681f5f98e12c6589917ea47cc3f2 Mon Sep 17 00:00:00 2001 From: Lori Corbani Date: Thu, 2 Apr 2020 07:29:03 -0400 Subject: [PATCH] TR13204/Infrastructure/python 3 --- HttpRequestGovernor.py | 9 +- Install | 5 +- PdfParser.py | 549 +++++++++++++------------ Pdfpath.py | 45 +- PubMedAgent.py | 545 +++++++++++++------------ PubMedCentralAgent.py | 15 +- extractedTextSplitter.py | 857 +++++++++++++++++++-------------------- 7 files changed, 1008 insertions(+), 1017 deletions(-) diff --git a/HttpRequestGovernor.py b/HttpRequestGovernor.py index 29edeb4..40b2714 100755 --- a/HttpRequestGovernor.py +++ b/HttpRequestGovernor.py @@ -1,4 +1,3 @@ -# Name: HttpRequestGovernor.py # Purpose: provides a class for managing the frequency with which we can make HTTP requests, # to ensure a configurable amount of "niceness" when reading from other sites # Notes: @@ -14,7 +13,7 @@ # You can also ask the governor to report on its statistics so far. import time -import urllib2 +import urllib.request, urllib.error, urllib.parse import runCommand # constants for convenience @@ -31,7 +30,7 @@ def readURL (url): # Purpose: given constraints on reading from https connections in python 2.7, we're just going # to shell out and use curl for this - # Returns: string returned + # Returns: str.returned # Throws: Exception if we have problems reading from 'url' stdout, stderr, statusCode = runCommand.runCommand("curl '%s'" % url) @@ -135,7 +134,7 @@ def get (self, url): try: response = readURL(url) - except Exception, e: + except Exception as e: raise Exception('The server could not fulfill the request: %s' % str(e)) return response @@ -150,4 +149,4 @@ def getStatistics (self): 'Average wait time: %6.3f sec' % (sum(self.timesWaited) / self.requestCount), 'Maximum wait time: %6.3f sec' % max(self.timesWaited), ] - return stats \ No newline at end of file + return stats diff --git a/Install b/Install index 282f04a..8d07993 100755 --- a/Install +++ b/Install @@ -48,7 +48,7 @@ fi # # 
Compile all Python scripts. # -python -c 'import compileall; compileall.compile_dir(".")' +${PYTHON} -m compileall -l -f . if [ $? -ne 0 ] then echo "Error compiling Python source" @@ -59,12 +59,11 @@ fi # Set the proper permissions on the Python files. # chmod 775 *.py -chmod 664 *.pyc # # Copy the Python files to the given library directory. # -for FILE in `ls *.py *.pyc` +for FILE in `ls *.py` do rm -f ${LIBRARY_DIRECTORY}/${FILE} cp -p ${FILE} ${LIBRARY_DIRECTORY} diff --git a/PdfParser.py b/PdfParser.py index 4a05939..94caa1d 100755 --- a/PdfParser.py +++ b/PdfParser.py @@ -1,4 +1,3 @@ -# Name: pdfParser.py # Purpose: provides functions for extracting text from PDF files # Notes: # 1. relies on MGI's litparser product to do the actual processing @@ -51,282 +50,282 @@ ###--- Functions ---### def setLitParserDir ( - directory # string; path to the litparser product - ): - # Purpose: initialize this module by identifying where to find the - # litparser product. - # Throws: Exception if 'directory' does not exist or if it does not - # contain the expected pdfGetFullText.sh script. - - global LITPARSER - - if not os.path.isdir(directory): - raise Exception('%s is not a directory' % directory) - - LITPARSER = os.path.join(directory, 'pdfGetFullText.sh') - if not os.path.exists(LITPARSER): - raise Exception('%s does not exist' % LITPARSER) - return - + directory # str. path to the litparser product + ): + # Purpose: initialize this module by identifying where to find the + # litparser product. + # Throws: Exception if 'directory' does not exist or if it does not + # contain the expected pdfGetFullText.sh script. 
+ + global LITPARSER + + if not os.path.isdir(directory): + raise Exception('%s is not a directory' % directory) + + LITPARSER = os.path.join(directory, 'pdfGetFullText.sh') + if not os.path.exists(LITPARSER): + raise Exception('%s does not exist' % LITPARSER) + return + def hyphenate (s): - # Purpose: fix the hyphenation in Blood DOI IDs, which should be - # of the format "-yyyy-mm-others" where the first six digits - # are the year, the next two are the month, and then all the - # others come at the end - # Returns: string updated according to 'Purpose', or the input string - # if there are not enough digits - - digits = s.replace('-', '').replace('.', '').replace(' ', '') - if len(digits) < 7: - return s - if s.find('.') >= 0: - return '.%s%s%s' % (digits[:4], digits[4:6], digits[6:]) - else: - return '-%s-%s-%s' % (digits[:4], digits[4:6], digits[6:]) + # Purpose: fix the hyphenation in Blood DOI IDs, which should be + # of the format "-yyyy-mm-others" where the first six digits + # are the year, the next two are the month, and then all the + # others come at the end + # Returns: str.updated according to 'Purpose', or the input string + # if there are not enough digits + + digits = s.replace('-', '').replace('.', '').replace(' ', '') + if len(digits) < 7: + return s + if s.find('.') >= 0: + return '.%s%s%s' % (digits[:4], digits[4:6], digits[6:]) + else: + return '-%s-%s-%s' % (digits[:4], digits[4:6], digits[6:]) ###--- Classes ---### class PdfParser: - # Is: a parser that knows how to extract text from a PDF file - # Has: path to a PDF file, text from a PDF file - # Does: reads a PDF file from the file system, parses it, provides - # access to full text and various bits of information - - def __init__ (self, - pdfPath # string; path to PDF file to parse - ): - # Purpose: constructor - # Throws: Exception if the file specified in 'pdfPath' does - # not exist - - if not os.path.exists(pdfPath): - raise Exception('PDF file does not exist: %s' % pdfPath) - - 
self.pdfPath = pdfPath # string; path to the PDF file - self.fullText = None # string; text from the PDF file - self.loaded = False # boolean; did we read the file yet? - return - - def _loadFullText (self): - # Purpose: (private) get the text from the PDF file - # Throws: Exception if this library has not been properly - # initialized or if there are errors in parsing the file - # Notes: only loads the file once; if we already ready it, - # calling this function is a no-op. - - if self.loaded: - return - - if not LITPARSER: - raise Exception('Must initialize pdfParser library using setLitParserDir()') - - cmd = '%s %s' % (LITPARSER, self.pdfPath) - try: - (stdout, stderr, exitCode) = runCommand.runCommand(cmd) - except: - # error in attempting to execute parsing script - raise Exception('Failed to execute: %s' % cmd) - - # parsing script finished with an error code? - if (exitCode != 0): - raise Exception('Failed to parse %s' % self.pdfPath) - - # parsing was successful, so grab the text and note that we - # loaded the file - - self.fullText = stdout - self.loaded = True - return - - def getFirstDoiID (self): - # Purpose: return the first DOI ID from the PDF file - # Returns: string DOI ID or None (if no ID can be found) - # Throws: Exception if this library has not been properly - # initialized or if there are errors in parsing the file - # Note: this would be more aptly named getDoiID() - - self._loadFullText() - - if self.fullText: - - # PNAS only - if self.fullText.find('www.pnas.org') >= 0: - - match = PNAS_DOI_RE.search(self.fullText) - doiID = match.group(1) - - # may have DCSuppoemental - try: - if self.fullText.find('DCSupplemental') >= 0: - doiID = match.group(2) - except: - pass - - # PNAS DOI sometimes have missing '/' so can't be found using DOI_RE - # determine if missing '/' OR intervening SINGLE non-alphnumeric char - # if no '/' - if doiID.find('/') == -1: - if doiID.find('pnas') == 7: # there is no '/', add one - doiID = doiID.replace('10.1073', 
'10.1073/') - elif doiID.find('pnas') == 8: # there is a single intervening char - charToReplace = doiID[7] - doiID = doiID.replace(charToReplace, '/') - return doiID - - # all else - else: - self.fullText = self.fullText.replace(' journal.pone', 'journal.pone') - match = DOI_RE.search(self.fullText) - - if match: - doiID = match.group(1) - slash = doiID.find('/') - nl = doiID.find('\n') - - # special case for PLoS journals, which often have a line break in the ID. - # PLOS journals have 28-character DOI IDs 99.98% of the time. Out of 10,000+ - # PLOS DOI IDs in MGI so far, the only others are single IDs with 21 and 24 - # characters. So if we encounter a newline within the first 21 characters, - # we can just remove it. - # Also as of new pdftotext util we started using in Oct 2019, the 1st or 2nd - # ID occurrance in the paper may be truncated when a space is inserted - # instead of a line break. So try looking for a couple ID instances. - - if doiID.startswith('10.1371/'): - if (0 <= nl < 21): # remove potential nl - doiID = doiID.replace('\n', '', 1) - slash = doiID.find('/') - nl = doiID.find('\n') - i = 0 - while len(doiID) < 28: # try another occurrance - if i == 3: break # quit after 3 tries - i += 1 - - match = DOI_RE.search(self.fullText, match.end()) - if not match: break # odd, this shouldn't happen, bail - doiID = match.group(1) - slash = doiID.find('/') - nl = doiID.find('\n') - - if (0 <= nl < 21): # remove potential nl - doiID = doiID.replace('\n', '', 1) - slash = doiID.find('/') - nl = doiID.find('\n') - - # special case for Molecular and Cellular Biology journal, which has DOI IDs - # from 20 to 32 characters -- but which are often interrupted by line breaks - # in their new (circa late-2016) PDF format. As a workaround for the most - # common case, remove any newlines within the first 20 characters of the ID. 
- - if doiID.startswith('10.1128/'): - while 0 <= nl < 20: - doiID = doiID.replace('\n', '', 1) - nl = doiID.find('\n') - - # if there is a newline right after the slash, - # just remove it - - if (nl >= 0) and (nl == (slash+1)): - doiID = doiID.replace('\n', '', 1) - nl = doiID.find('\n') - - # if there is a newline later in the string, - # trim the ID at that point - - if (nl >= 0) and (nl > slash): - doiID = doiID[:nl] - - # strip off trailing parentheses, periods, - # brackets, and whitespace - doiID = re.sub('[\)\.\]\s]+$', '', doiID) - - # eLife IDs often errantly end with .001 - if (doiID.find('/eLife') > 0) and (doiID.endswith('.001')): - doiID = doiID[:-4] - - # if this is a Blood DOI ID, - # the hypenation sometimes needs tweaking - # may contain a '.' or a ' ' - if doiID.startswith('10.1182/blood'): - match = BLOOD_DOI_RE.search(self.fullText) - doiID = match.group(0) - numbers = match.group(1) - revised = hyphenate(numbers) - doiID = doiID.replace(numbers, revised) - doiID = doiID.replace(' ', '') - doiID = doiID.replace('\n', '') - - if doiID.startswith('10.1172/jci'): - match = JCI_DOI_RE.search(self.fullText) - doiID = match.group(0) - doiID = doiID.replace(' ', '') - - if doiID.startswith('10.1530/REP'): - match = REP_DOI_RE.search(self.fullText) - doiID = match.group(0) - doiID = doiID.replace('doi.org/', '') - doiID = doiID.replace(' ', '') - - # if this is a 10.1177/...Journal DOI ID, - # then remove the trailing 'Journal' text - match = JOURNAL_DOI_RE.match(doiID) - if match: - doiID = doiID.replace('Journal', '') - - # if this is a Science DOI ID, we instead need - # to find and return the last DOI ID for the - # PDF file. - if doiID.startswith('10.1126/science') or \ - doiID.startswith('10.1126/scisignal'): - doiID = self._getScienceID() - - return doiID - return None - - def _getScienceID (self): - # Science journals include the end of the prior article at the - # start of the PDF file. 
This means that we will usually - # return an inaccurate DOI ID for PDFs from Science journals. - # Instead, the desired ID occurs at the end of the article, - # shortly after the word "accepted". Use these criteria to - # get the desired ID and return it. - - # To get to this method, we must have already loaded the - # full text, and it must have been non-null. - - # Find all occurrences of the word 'accepted' and note the - # position of each. (It is possible that 'accepted' would - # occur in the start of the next article, so we can't just - # blindly take the last one.) - - acceptedPositions = [] - match = ACCEPTED_RE.search(self.fullText) - while match: - pos = match.regs[0][0] - acceptedPositions.append(pos) - match = ACCEPTED_RE.search(self.fullText, pos + 1) - - # Now start at the last occurrence of "accepted" and see if - # we can find a Science DOI ID reasonably soon after it. If - # so, that's our desired ID to return. If not, work back - # through the other instances of "accepted". - - # how close is close enough? (number of characters) - threshold = 80 - acceptedPositions.reverse() - - for accPos in acceptedPositions: - match = SCIENCE_DOI_RE.search(self.fullText, accPos) - if match: - if (match.regs[0][0] <= (accPos + threshold)): - return match.group(1) - return None - - def getText (self): - # Purpose: return the full text extracted from the PDF file - # Returns: string (full text) - - self._loadFullText() - if self.fullText: - return self.fullText - return None + # Is: a parser that knows how to extract text from a PDF file + # Has: path to a PDF file, text from a PDF file + # Does: reads a PDF file from the file system, parses it, provides + # access to full text and various bits of information + + def __init__ (self, + pdfPath # str. 
path to PDF file to parse + ): + # Purpose: constructor + # Throws: Exception if the file specified in 'pdfPath' does + # not exist + + if not os.path.exists(pdfPath): + raise Exception('PDF file does not exist: %s' % pdfPath) + + self.pdfPath = pdfPath # str. path to the PDF file + self.fullText = None # str. text from the PDF file + self.loaded = False # boolean; did we read the file yet? + return + + def _loadFullText (self): + # Purpose: (private) get the text from the PDF file + # Throws: Exception if this library has not been properly + # initialized or if there are errors in parsing the file + # Notes: only loads the file once; if we already ready it, + # calling this function is a no-op. + + if self.loaded: + return + + if not LITPARSER: + raise Exception('Must initialize pdfParser library using setLitParserDir()') + + cmd = '%s %s' % (LITPARSER, self.pdfPath) + try: + (stdout, stderr, exitCode) = runCommand.runCommand(cmd) + except: + # error in attempting to execute parsing script + raise Exception('Failed to execute: %s' % cmd) + + # parsing script finished with an error code? 
+ if (exitCode != 0): + raise Exception('Failed to parse %s' % self.pdfPath) + + # parsing was successful, so grab the text and note that we + # loaded the file + + self.fullText = stdout + self.loaded = True + return + + def getFirstDoiID (self): + # Purpose: return the first DOI ID from the PDF file + # Returns: str.DOI ID or None (if no ID can be found) + # Throws: Exception if this library has not been properly + # initialized or if there are errors in parsing the file + # Note: this would be more aptly named getDoiID() + + self._loadFullText() + + if self.fullText: + + # PNAS only + if self.fullText.find('www.pnas.org') >= 0: + + match = PNAS_DOI_RE.search(self.fullText) + doiID = match.group(1) + + # may have DCSuppoemental + try: + if self.fullText.find('DCSupplemental') >= 0: + doiID = match.group(2) + except: + pass + + # PNAS DOI sometimes have missing '/' so can't be found using DOI_RE + # determine if missing '/' OR intervening SINGLE non-alphnumeric char + # if no '/' + if doiID.find('/') == -1: + if doiID.find('pnas') == 7: # there is no '/', add one + doiID = doiID.replace('10.1073', '10.1073/') + elif doiID.find('pnas') == 8: # there is a single intervening char + charToReplace = doiID[7] + doiID = doiID.replace(charToReplace, '/') + return doiID + + # all else + else: + self.fullText = self.fullText.replace(' journal.pone', 'journal.pone') + match = DOI_RE.search(self.fullText) + + if match: + doiID = match.group(1) + slash = doiID.find('/') + nl = doiID.find('\n') + + # special case for PLoS journals, which often have a line break in the ID. + # PLOS journals have 28-character DOI IDs 99.98% of the time. Out of 10,000+ + # PLOS DOI IDs in MGI so far, the only others are single IDs with 21 and 24 + # characters. So if we encounter a newline within the first 21 characters, + # we can just remove it. 
+ # Also as of new pdftotext util we started using in Oct 2019, the 1st or 2nd + # ID occurrance in the paper may be truncated when a space is inserted + # instead of a line break. So try looking for a couple ID instances. + + if doiID.startswith('10.1371/'): + if (0 <= nl < 21): # remove potential nl + doiID = doiID.replace('\n', '', 1) + slash = doiID.find('/') + nl = doiID.find('\n') + i = 0 + while len(doiID) < 28: # try another occurrance + if i == 3: break # quit after 3 tries + i += 1 + + match = DOI_RE.search(self.fullText, match.end()) + if not match: break # odd, this shouldn't happen, bail + doiID = match.group(1) + slash = doiID.find('/') + nl = doiID.find('\n') + + if (0 <= nl < 21): # remove potential nl + doiID = doiID.replace('\n', '', 1) + slash = doiID.find('/') + nl = doiID.find('\n') + + # special case for Molecular and Cellular Biology journal, which has DOI IDs + # from 20 to 32 characters -- but which are often interrupted by line breaks + # in their new (circa late-2016) PDF format. As a workaround for the most + # common case, remove any newlines within the first 20 characters of the ID. + + if doiID.startswith('10.1128/'): + while 0 <= nl < 20: + doiID = doiID.replace('\n', '', 1) + nl = doiID.find('\n') + + # if there is a newline right after the slash, + # just remove it + + if (nl >= 0) and (nl == (slash+1)): + doiID = doiID.replace('\n', '', 1) + nl = doiID.find('\n') + + # if there is a newline later in the str. + # trim the ID at that point + + if (nl >= 0) and (nl > slash): + doiID = doiID[:nl] + + # strip off trailing parentheses, periods, + # brackets, and whitespace + doiID = re.sub('[\)\.\]\s]+$', '', doiID) + + # eLife IDs often errantly end with .001 + if (doiID.find('/eLife') > 0) and (doiID.endswith('.001')): + doiID = doiID[:-4] + + # if this is a Blood DOI ID, + # the hypenation sometimes needs tweaking + # may contain a '.' 
or a ' ' + if doiID.startswith('10.1182/blood'): + match = BLOOD_DOI_RE.search(self.fullText) + doiID = match.group(0) + numbers = match.group(1) + revised = hyphenate(numbers) + doiID = doiID.replace(numbers, revised) + doiID = doiID.replace(' ', '') + doiID = doiID.replace('\n', '') + + if doiID.startswith('10.1172/jci'): + match = JCI_DOI_RE.search(self.fullText) + doiID = match.group(0) + doiID = doiID.replace(' ', '') + + if doiID.startswith('10.1530/REP'): + match = REP_DOI_RE.search(self.fullText) + doiID = match.group(0) + doiID = doiID.replace('doi.org/', '') + doiID = doiID.replace(' ', '') + + # if this is a 10.1177/...Journal DOI ID, + # then remove the trailing 'Journal' text + match = JOURNAL_DOI_RE.match(doiID) + if match: + doiID = doiID.replace('Journal', '') + + # if this is a Science DOI ID, we instead need + # to find and return the last DOI ID for the + # PDF file. + if doiID.startswith('10.1126/science') or \ + doiID.startswith('10.1126/scisignal'): + doiID = self._getScienceID() + + return doiID + return None + + def _getScienceID (self): + # Science journals include the end of the prior article at the + # start of the PDF file. This means that we will usually + # return an inaccurate DOI ID for PDFs from Science journals. + # Instead, the desired ID occurs at the end of the article, + # shortly after the word "accepted". Use these criteria to + # get the desired ID and return it. + + # To get to this method, we must have already loaded the + # full text, and it must have been non-null. + + # Find all occurrences of the word 'accepted' and note the + # position of each. (It is possible that 'accepted' would + # occur in the start of the next article, so we can't just + # blindly take the last one.) 
+ + acceptedPositions = [] + match = ACCEPTED_RE.search(self.fullText) + while match: + pos = match.regs[0][0] + acceptedPositions.append(pos) + match = ACCEPTED_RE.search(self.fullText, pos + 1) + + # Now start at the last occurrence of "accepted" and see if + # we can find a Science DOI ID reasonably soon after it. If + # so, that's our desired ID to return. If not, work back + # through the other instances of "accepted". + + # how close is close enough? (number of characters) + threshold = 80 + acceptedPositions.reverse() + + for accPos in acceptedPositions: + match = SCIENCE_DOI_RE.search(self.fullText, accPos) + if match: + if (match.regs[0][0] <= (accPos + threshold)): + return match.group(1) + return None + + def getText (self): + # Purpose: return the full text extracted from the PDF file + # Returns: str.(full text) + + self._loadFullText() + if self.fullText: + return self.fullText + return None diff --git a/Pdfpath.py b/Pdfpath.py index 2eadd12..2c1bc6e 100755 --- a/Pdfpath.py +++ b/Pdfpath.py @@ -1,4 +1,3 @@ -''' # # Pdfpath.py # @@ -17,7 +16,6 @@ # - TR12250/Lit Triage # 04/04/2019 jak # - TR12763 -''' import sys import os @@ -43,31 +41,30 @@ def getPdfpath(parentpath, mgiID): #print getPdfpath('/data/littriage', 'MGI:') #print '' - print 'MGI:1' - print getPdfpath('/data/littriage', 'MGI:1') - print '' + print ('MGI:1') + print (getPdfpath('/data/littriage', 'MGI:1')) + print ('') - print 'MGI:11' - print getPdfpath('/data/littriage', 'MGI:11') - print '' + print ('MGI:11') + print (getPdfpath('/data/littriage', 'MGI:11')) + print ('') - print 'MGI:111' - print getPdfpath('/data/littriage', 'MGI:111') - print '' + print ('MGI:111') + print (getPdfpath('/data/littriage', 'MGI:111')) + print ('') - print 'MGI:1111' - print getPdfpath('/data/littriage/', 'MGI:1111') - print '' + print ('MGI:1111') + print (getPdfpath('/data/littriage/', 'MGI:1111')) + print ('') - print 'MGI:11111' - print getPdfpath('/data/littriage/', 'MGI:11111') - print '' + print 
('MGI:11111') + print (getPdfpath('/data/littriage/', 'MGI:11111')) + print ('') - print 'MGI:111111' - print getPdfpath('/data/littriage/', 'MGI:111111') - print '' - - print 'MGI:1111111' - print getPdfpath('/data/littriage/', 'MGI:1111111') - print '' + print ('MGI:111111') + print (getPdfpath('/data/littriage/', 'MGI:111111')) + print ('') + print ('MGI:1111111') + print (getPdfpath('/data/littriage/', 'MGI:1111111')) + print ('') diff --git a/PubMedAgent.py b/PubMedAgent.py index f5e99a0..f7a7568 100755 --- a/PubMedAgent.py +++ b/PubMedAgent.py @@ -1,4 +1,3 @@ -# Name: PubMedAgent.py # Purpose: to provide an easy means to fetch reference data from PubMed in # a variety of formats # Usage: @@ -10,7 +9,7 @@ # back data in your desired format using getReference(doiID) or getReferences(doiList) import string -import urllib +import urllib.request, urllib.parse, urllib.error import csv import xml.dom.minidom import os @@ -74,188 +73,188 @@ class PubMedReference: # error message is then accessible from getErrorMessage(). 
def __init__ (self, errorMessage = None): - self.pubMedID = None - self.doiID = None - self.title = None - self.authors = None - self.journal = None - self.date = None - self.year = None - self.issue = None - self.pages = None - self.abstract = None - self.volume = None - self.primaryAuthor = None - self.publicationType = None - # add other fields as needed - - self.errorMessage = errorMessage - - return + self.pubMedID = None + self.doiID = None + self.title = None + self.authors = None + self.journal = None + self.date = None + self.year = None + self.issue = None + self.pages = None + self.abstract = None + self.volume = None + self.primaryAuthor = None + self.publicationType = None + # add other fields as needed + + self.errorMessage = errorMessage + + return ###--- setter/getter methods ---### def isValid(self): - return self.errorMessage == None + return self.errorMessage == None def getErrorMessage(self): - return self.errorMessage + return self.errorMessage def setPubMedID(self, pmID): - self.pubMedID = pmID + self.pubMedID = pmID def getPubMedID(self): - return self.pubMedID + return self.pubMedID def setDoiID(self, doiID): - self.doiID = doiID + self.doiID = doiID def getDoiID(self): - return self.doiID + return self.doiID def setTitle(self, title): - self.title = title + self.title = title def getTitle(self): - return self.title + return self.title def setAuthors(self, authors): - self.authors = authors + self.authors = authors def getAuthors(self): - return self.authors + return self.authors def setJournal(self, journal): - self.journal = journal + self.journal = journal def getJournal(self): - return self.journal + return self.journal def setDate(self, date): - self.date = date + self.date = date def getDate(self): - return self.date + return self.date def setYear(self, year): - self.year = year + self.year = year def getYear(self): - return self.year + return self.year def setIssue(self, issue): - self.issue = issue + self.issue = issue def 
getIssue(self): - return self.issue + return self.issue def setPages(self, pages): - self.pages = pages + self.pages = pages def getPages(self): - return self.pages + return self.pages def setAbstract(self, abstract): - self.abstract = abstract + self.abstract = abstract def getAbstract(self): - return self.abstract + return self.abstract def setVolume(self, volume): - self.volume = volume + self.volume = volume def getVolume(self): - return self.volume + return self.volume def setPrimaryAuthor(self, pAuthor): - self.primaryAuthor = pAuthor + self.primaryAuthor = pAuthor def getPrimaryAuthor(self): - return self.primaryAuthor + return self.primaryAuthor def setPublicationType(self, publicationType): - self.publicationType = publicationType + self.publicationType = publicationType def getPublicationType(self): - return self.publicationType + return self.publicationType # add other accessors as needed class PubMedAgent: - # Is: an agent that interacts with PubMed to get reference data - # for DOI IDs - # Does: takes DOI IDs, queries PubMed, and returns PubMedReference - # objects for them - - def __init__ (self): - # Purpose: constructor - return - - def getPubMedID (self, doiID): - # Purpose: return the PubMed ID corresponding to this doiID, or None - # if there is no corresponding PubMed ID - # Throws: Exception if the URL returns an error - # Notes: 6/30 - not tested - - return self.getPubMedIDs([doiID])[doiID] - - def getPubMedIDs (self, doiList): - # Purpose: return a dictionary mapping from each DOI ID to its - # corresponding PubMed ID. If no PubMed ID for a given DOI ID, - # then that one maps to None. 
- # Throws: Exception if the URL returns an error - mapping = {} # {doiid: [pubMedId(s)], ...} - try: - #print '### Getting PubMed IDs ###\n' - for doiID in doiList: - forUrl = doiID - forUrl = doiID.replace('(', '*') - forUrl = doiID.replace(')', '*') - forUrl = doiID.replace(';', '*') - forUrl = doiID.replace(':', '*') - response = urllib.urlopen(ID_CONVERTER_URL % (XML, forUrl)) - record = string.strip(response.read()) - xmldoc = xml.dom.minidom.parseString(record) - pubmedIDs = xmldoc.getElementsByTagName("Id") - #print '*****\n\n' - #print ID_CONVERTER_URL % (XML, doiID) - #print record - #print 'pubmedIDs : ', str(pubmedIDs) - #print 'doiID : ', doiID - if doiID not in mapping: - mapping[doiID] = [] - if pubmedIDs == []: - mapping[doiID].append(None) - else: - for pmID in pubmedIDs: - #print 'pm: %s' % pmID.firstChild.data - mapping[doiID].append(pmID.firstChild.data) - except IOError, e: - if hasattr(e, 'code'): # HTTPError - print 'HTTP error code: ', e.code - raise Exception('HTTP error code: %s' % e.code) - elif hasattr(e, 'reason'): # URLError - print "Can't connect, reason: ", e.reason - raise Exception("Can't connect, reason: %s" % e.reason) - else: - raise Exception('Unknown exception: %s' % e) - - return mapping - - def getReferenceInfo(self, doiList): - # Purpose: stub to be implemented by child - return - - def getReference (self, doiID): - # Purpose: returns a dictionary that maps each DOI ID to its + # Is: an agent that interacts with PubMed to get reference data + # for DOI IDs + # Does: takes DOI IDs, queries PubMed, and returns PubMedReference + # objects for them + + def __init__ (self): + # Purpose: constructor + return + + def getPubMedID (self, doiID): + # Purpose: return the PubMed ID corresponding to this doiID, or None + # if there is no corresponding PubMed ID + # Throws: Exception if the URL returns an error + # Notes: 6/30 - not tested + + return self.getPubMedIDs([doiID])[doiID] + + def getPubMedIDs (self, doiList): + # Purpose: 
return a dictionary mapping from each DOI ID to its + # corresponding PubMed ID. If no PubMed ID for a given DOI ID, + # then that one maps to None. + # Throws: Exception if the URL returns an error + mapping = {} # {doiid: [pubMedId(s)], ...} + try: + #print '### Getting PubMed IDs ###\n' + for doiID in doiList: + forUrl = doiID + forUrl = doiID.replace('(', '*') + forUrl = doiID.replace(')', '*') + forUrl = doiID.replace(';', '*') + forUrl = doiID.replace(':', '*') + response = urllib.request.urlopen(ID_CONVERTER_URL % (XML, forUrl)) + record = str.strip(response.read()) + xmldoc = xml.dom.minidom.parseString(record) + pubmedIDs = xmldoc.getElementsByTagName("Id") + #print '*****\n\n' + #print ID_CONVERTER_URL % (XML, doiID) + #print record + #print 'pubmedIDs : ', str(pubmedIDs) + #print 'doiID : ', doiID + if doiID not in mapping: + mapping[doiID] = [] + if pubmedIDs == []: + mapping[doiID].append(None) + else: + for pmID in pubmedIDs: + #print 'pm: %s' % pmID.firstChild.data + mapping[doiID].append(pmID.firstChild.data) + except IOError as e: + if hasattr(e, 'code'): # HTTPError + print('HTTP error code: ', e.code) + raise Exception('HTTP error code: %s' % e.code) + elif hasattr(e, 'reason'): # URLError + print("Can't connect, reason: ", e.reason) + raise Exception("Can't connect, reason: %s" % e.reason) + else: + raise Exception('Unknown exception: %s' % e) + + return mapping + + def getReferenceInfo(self, doiList): + # Purpose: stub to be implemented by child + return + + def getReference (self, doiID): + # Purpose: returns a dictionary that maps each DOI ID to its # corresponding PubMedReference object(s) (or None, if there # is no reference data in PubMed for that DOI ID) - # DOI ID can map to multiple PubMed - # sc - this has not been tested - return self.getReferences([doiID])[doiID] - - def getReferences (self, doiList): - # Purpose: returns a dictionary that maps each DOI ID to its - # corresponding PubMedReference object(s) (or None, if there - # is 
no reference data in PubMed for that DOI ID) - # Notes: DOI ID can map to multiple PubMed - - # translate doiList to doiID/pubmedID dictionary - # pubMedDict = {doiID:pubMedID, ...} - #print 'getReferences doiList: %s' % doiList - - pubMedDict = self.getPubMedIDs(doiList) - - # call getReferenceInfo - which is implemented by the subclass. - - mapping = {} - #print '### Getting PubMed References ###' - for doiID in pubMedDict: - if doiID not in mapping: - mapping[doiID] = [] - pubMedIdList = pubMedDict[doiID] - refObject = None # default, for no pmID - #print 'pubMedIdList: %s' % pubMedIdList - for pubMedID in pubMedIdList: - if pubMedID == None: - mapping[doiID].append(refObject) - else: - refObject = self.getReferenceInfo(pubMedID) - mapping[doiID].append(refObject) - return mapping + # DOI ID can map to multiple PubMed + # sc - this has not been tested + return self.getReferences([doiID])[doiID] + + def getReferences (self, doiList): + # Purpose: returns a dictionary that maps each DOI ID to its + # corresponding PubMedReference object(s) (or None, if there + # is no reference data in PubMed for that DOI ID) + # Notes: DOI ID can map to multiple PubMed + + # translate doiList to doiID/pubmedID dictionary + # pubMedDict = {doiID:pubMedID, ...} + #print 'getReferences doiList: %s' % doiList + + pubMedDict = self.getPubMedIDs(doiList) + + # call getReferenceInfo - which is implemented by the subclass. 
+ + mapping = {} + #print '### Getting PubMed References ###' + for doiID in pubMedDict: + if doiID not in mapping: + mapping[doiID] = [] + pubMedIdList = pubMedDict[doiID] + refObject = None # default, for no pmID + #print 'pubMedIdList: %s' % pubMedIdList + for pubMedID in pubMedIdList: + if pubMedID == None: + mapping[doiID].append(refObject) + else: + refObject = self.getReferenceInfo(pubMedID) + mapping[doiID].append(refObject) + return mapping class PubMedAgentJson (PubMedAgent): # Is: an agent that interacts with PubMed to get reference data @@ -264,8 +263,8 @@ class PubMedAgentJson (PubMedAgent): # for each reference # Note: Not implemented def __init__ (self): - # Purpose: constructor - return + # Purpose: constructor + return # override method used to format each reference, reporting JSON # for this class @@ -274,139 +273,139 @@ class PubMedAgentMedline (PubMedAgent): # Is: an agent that interacts with PubMed to get reference data # for DOI IDs # Does: takes DOI IDs, queries PubMed, and returns a Medline-formatted - # string for each reference + # str.for each reference def __init__ (self): - return + return # override method used to format each reference, reporting Medline # format for the PubMed request def getReferenceInfo(self, pubMedID): - # Purpose: Implementation of the superclass stub. 
Given a pubMedID, get a - # MedLine record, parse, create and return a PubMedReference object - # Throws: Exception if the URL returns an error - # Init the reference we will return - pubMedRef = None - try: - #print REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE) - response = urllib.urlopen(REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE)) - medLineRecord = string.strip(response.read()) - #print '"%s"' % medLineRecord - except IOError, e: - if hasattr(e, 'code'): # HTTPError - print 'http error code: ', e.code - raise Exception('HTTP error code: %s' % e.code) - elif hasattr(e, 'reason'): # URLError - print "Can't connect, reason: ", e.reason - raise Exception("Can't connect, reason: %s" % e.reason) - else: - raise Exception('Unknown exception: %s' % e) - - # if this pubMedID returns an error, create reference object with - # that error message, otherwise parse the record - if string.find(medLineRecord, 'Error occurred:') != -1: - pubMedRef = PubMedReference(errorMessage = medLineRecord) - else: - pubMedRef = PubMedReference() - tokens = string.split(medLineRecord, '\n') - - # Abstract, multilined w/o additional tag - isAB = 0 - abList = [] - - # author, multilined each with tag - auList = [] - - # title, multilined w/o additional tag - isTI = 0 - tiList = [] - - # publication type - isPT = 0 - - for line in tokens: - # parse MedLine format - - #print line - - if isTI == 1: - if line.startswith(' '): - tiList.append(string.strip(line)) - continue + # Purpose: Implementation of the superclass stub. 
Given a pubMedID, get a + # MedLine record, parse, create and return a PubMedReference object + # Throws: Exception if the URL returns an error + # Init the reference we will return + pubMedRef = None + try: + #print REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE) + response = urllib.request.urlopen(REFERENCE_FETCH_URL % (pubMedID, TEXT, MEDLINE)) + medLineRecord = str.strip(response.read()) + #print '"%s"' % medLineRecord + except IOError as e: + if hasattr(e, 'code'): # HTTPError + print('http error code: ', e.code) + raise Exception('HTTP error code: %s' % e.code) + elif hasattr(e, 'reason'): # URLError + print("Can't connect, reason: ", e.reason) + raise Exception("Can't connect, reason: %s" % e.reason) + else: + raise Exception('Unknown exception: %s' % e) + + # if this pubMedID returns an error, create reference object with + # that error message, otherwise parse the record + if str.find(medLineRecord, 'Error occurred:') != -1: + pubMedRef = PubMedReference(errorMessage = medLineRecord) + else: + pubMedRef = PubMedReference() + tokens = str.split(medLineRecord, '\n') + + # Abstract, multilined w/o additional tag + isAB = 0 + abList = [] + + # author, multilined each with tag + auList = [] + + # title, multilined w/o additional tag + isTI = 0 + tiList = [] + + # publication type + isPT = 0 + + for line in tokens: + # parse MedLine format + + #print line + + if isTI == 1: + if line.startswith(' '): + tiList.append(str.strip(line)) + continue else: - isTI = 0 + isTI = 0 - if isAB == 1: - if line.startswith(' '): - abList.append(string.strip(line)) - continue + if isAB == 1: + if line.startswith(' '): + abList.append(str.strip(line)) + continue else: - isAB = 0 + isAB = 0 - # strip by first '-' - try: - value = (map(string.strip,string.split(line, '-', 1)))[1] - # else use entire line + # strip by first '-' + try: + value = (list(map(str.strip,str.split(line, '-', 1))))[1] + # else use entire line except: - value = string.strip(line) - - # tags of interest - if 
line.startswith('PMID'): - pubMedRef.setPubMedID(value) - - elif line.startswith('TI'): - isTI = 1 - tiList.append(value) - - # skip 'AUID-' - elif line.startswith('AU -'): - if auList == []: - pubMedRef.setPrimaryAuthor(value) - auList.append(value) - - elif line.startswith('TA'): - pubMedRef.setJournal(value) - - elif line.startswith('DP'): - pubMedRef.setDate(value) - #print 'setting date in reference from: %s' % value - pubMedRef.setYear(string.split(value, ' ', 1)[0]) - - elif line.startswith('IP'): - pubMedRef.setIssue(value) - - elif line.startswith('PG'): - pubMedRef.setPages(value) - - elif line.startswith('AB'): - isAB = 1 - abList.append(value) - - elif line.startswith('VI'): - pubMedRef.setVolume(value) - - elif line.startswith('AID') and (string.find(line, '[doi]') > 0): - pubMedRef.setDoiID(string.strip(string.split(line, 'AID -')[1].split('[')[0])) - - elif line.startswith('PT'): - - # find last PT or use list - if isPT == 0: - if value == 'Review': - pubMedRef.setPublicationType(value) - isPT = 1 - elif value == 'Editorial': - pubMedRef.setPublicationType(value) - isPT = 1 - elif value == 'Comment': - pubMedRef.setPublicationType(value) - isPT = 1 - else: - pubMedRef.setPublicationType(value) - - pubMedRef.setAbstract(string.join(abList)) - pubMedRef.setAuthors(string.join(auList, '; ')) - pubMedRef.setTitle(string.join(tiList)) - - return pubMedRef + value = str.strip(line) + + # tags of interest + if line.startswith('PMID'): + pubMedRef.setPubMedID(value) + + elif line.startswith('TI'): + isTI = 1 + tiList.append(value) + + # skip 'AUID-' + elif line.startswith('AU -'): + if auList == []: + pubMedRef.setPrimaryAuthor(value) + auList.append(value) + + elif line.startswith('TA'): + pubMedRef.setJournal(value) + + elif line.startswith('DP'): + pubMedRef.setDate(value) + #print 'setting date in reference from: %s' % value + pubMedRef.setYear(str.split(value, ' ', 1)[0]) + + elif line.startswith('IP'): + pubMedRef.setIssue(value) + + elif 
line.startswith('PG'): + pubMedRef.setPages(value) + + elif line.startswith('AB'): + isAB = 1 + abList.append(value) + + elif line.startswith('VI'): + pubMedRef.setVolume(value) + + elif line.startswith('AID') and (str.find(line, '[doi]') > 0): + pubMedRef.setDoiID(str.strip(str.split(line, 'AID -')[1].split('[')[0])) + + elif line.startswith('PT'): + + # find last PT or use list + if isPT == 0: + if value == 'Review': + pubMedRef.setPublicationType(value) + isPT = 1 + elif value == 'Editorial': + pubMedRef.setPublicationType(value) + isPT = 1 + elif value == 'Comment': + pubMedRef.setPublicationType(value) + isPT = 1 + else: + pubMedRef.setPublicationType(value) + + pubMedRef.setAbstract(str.join(abList)) + pubMedRef.setAuthors(str.join(auList, '; ')) + pubMedRef.setTitle(str.join(tiList)) + + return pubMedRef diff --git a/PubMedCentralAgent.py b/PubMedCentralAgent.py index 6ec771c..41b3cf6 100755 --- a/PubMedCentralAgent.py +++ b/PubMedCentralAgent.py @@ -1,4 +1,3 @@ -# Name: PubMedCentralAgent.py # Purpose: provide an interface to various services at PubMed Central # Usage: # 1. Initialize the module by calling setToolName() and/or setEmailAddress() as desired to override @@ -7,7 +6,7 @@ # PMC IDs and look up) # 3. Run with it. 
-import urllib2
+import urllib.request, urllib.error, urllib.parse
 import xml.dom.minidom
 import HttpRequestGovernor
 
@@ -68,7 +67,7 @@ def __init__ (self):
 
     def getPMCID (self, doiID):
         # Purpose: look up the PMC ID for a single DOI ID
-        # Returns: string (PMC ID) or None (if the DOI ID has no PMC ID)
+        # Returns: string (PMC ID) or None (if the DOI ID has no PMC ID)
         # Throws: Exception if there are problems communicating with PubMed Central
 
         return self.getPMCIDs([ doiID ])[doiID]
@@ -84,14 +83,14 @@ def getPMCIDs (self, doiIDs):
             return pmcIDs
 
         # strip leading & trailing spaces from IDs and split the list into chunks
-        sublists = _splitList(map(lambda x : x.strip(), doiIDs), 20)
+        sublists = _splitList([x.strip() for x in doiIDs], 20)
 
         for sublist in sublists:
             lines = HttpRequestGovernor.readURL(ID_CONVERTER_URL % (TOOL_NAME, EMAIL_ADDRESS, ','.join(sublist)))
 
             # Lines have comma-delimited columns. String values are in double-quotes.
             # Standardize lines by stripping out the double-quotes, then splitting on commas.
-            lines = map(lambda x: x.split(','), lines.replace('"', '').split('\n'))
+            lines = [x.split(',') for x in lines.replace('"', '').split('\n')]
 
             # first line will have column headers. We need DOI and PMCID columns.
if 'DOI' not in lines[0]: @@ -119,7 +118,7 @@ def __init__ (self): def getUrl (self, pmcID): # Purpose: look up the download URL for a single PMC ID - # Returns: string (URL) or None (if the PMC ID has no file to download) + # Returns: str.(URL) or None (if the PMC ID has no file to download) # Throws: Exception if there are problems communicating with PubMed Central return self.getUrls([ pmcID ])[pmcID] @@ -136,7 +135,7 @@ def getUrls (self, pmcIDs): if not pmcIDs: return urls - for pmcID in map(lambda x: x.strip(), pmcIDs): + for pmcID in [x.strip() for x in pmcIDs]: lines = HttpRequestGovernor.readURL(PDF_LOOKUP_URL % pmcID) xmldoc = xml.dom.minidom.parseString(lines) @@ -154,4 +153,4 @@ def getUrls (self, pmcIDs): else: urls[pmcID] = None - return urls \ No newline at end of file + return urls diff --git a/extractedTextSplitter.py b/extractedTextSplitter.py index 1f80482..6dca9ef 100755 --- a/extractedTextSplitter.py +++ b/extractedTextSplitter.py @@ -1,4 +1,3 @@ -#!/usr/local/bin/python #!/usr/bin/env python2.4 #!/usr/bin/env python @@ -20,13 +19,13 @@ body - everything up to the next section references - the reference section manuscript figures - some manuscript PDFs have figures/tables after - the refs - (WE USE THE TERM "figure" to mean "figure/table" ) + the refs + (WE USE THE TERM "figure" to mean "figure/table" ) star*methods - some papers have an extended "methods" section - after the refs. This section is called "Star*Methods" + after the refs. This section is called "Star*Methods" supplemental data - indicated by a special MGI text tag inserted by - Nancy or someone in MGI when the supp data is added - to the PDF + Nancy or someone in MGI when the supp data is added + to the PDF The end of each section is the beginning of the next section. @@ -46,17 +45,17 @@ splitter = ExtTextSplitter() - # To just get the actual sections' string: + # To just get the actual sections' str. 
(body, refs, manuFigures, starMethods, suppData) = \ - splitter.splitSections(text) + splitter.splitSections(text) # To get Section objects that contain more info about each predicted section # (Section Objects contain: type, text, sPos, ePos, and a "reason" the # splitting algorithm made its prediction - see Section Class below): (bodyS, refsS, manuFiguresS, starMethodsS, suppDataS) = \ - splitter.findSections(text) + splitter.findSections(text) ################### Overview of the Splitting Algorithm: ################### @@ -67,8 +66,8 @@ If Refs start is found, search for Manuscript figures: search foward from there for a figure legend occurring before the start of Star Methods NOTE this will match any figure/table legend after the start of the refs, - even if they are not in an official "manuscript" section. - But ending the refs section at any legend is good + even if they are not in an official "manuscript" section. + But ending the refs section at any legend is good Body = start of text up to References start There are also minFraction (for all sections) and maxFraction (for references @@ -82,10 +81,10 @@ which may == end of extracted text. Algorithm approach: -You cannot search backward in a string with a regex. +You cannot search backward in a str.with a regex. Instead, we put all the different section heading patterns into one honking regex that we match (forward) against the extText. -(so we only make one pass through the string) +(so we only make one pass through the str. Then we can scan the list of all regex matches backward (or forward) as neededj. See class TypedRegexMatcher @@ -96,9 +95,9 @@ class Section - describes a section class ExtTextSplitter - implements the splitting algorithm class TypedRegexMatcher - takes regex's grouped by user defined "type", - combines them, lets you match against a string, - and gives you back lists of TypedMatch objects - that represent the matches found + combines them, lets you match against a str. 
+ and gives you back lists of TypedMatch objects + that represent the matches found class TypedMatch - describes a match from TypedRegexMatcher """ import string @@ -108,12 +107,12 @@ class TypedMatch - describes a match from TypedRegexMatcher # Regex building functions # ---------------------------------- def spacedOutRegex(s): - # for given string, return regex pattern string that matches the chars - # in the string with optional spaces between the chars. + # for given str. return regex pattern str.that matches the chars + # in the str.with optional spaces between the chars. # Useful because sometimes the PDF to text extraction inserts spaces. reg = [] for c in s: - reg.append('[%s]' % c) + reg.append('[%s]' % c) return '[ ]*'.join(reg) #----------------------------------------------- @@ -133,42 +132,42 @@ class Section (object): IS an object that describes a section of an article (from extracted text) """ def __init__(self, secType, text='', reason='', sPos=None, ePos=None): - self.secType = secType # section name. see vocab above - self.text = text # the text of the section - self.reason = reason # reason this section start was chosen - # typically the string that we matched - # for section header. - self.sPos = sPos # start position within the article text - self.ePos = ePos # end pos - index of 1st char not in section + self.secType = secType # section name. see vocab above + self.text = text # the text of the section + self.reason = reason # reason this section start was chosen + # typically the str.that we matched + # for section header. 
+ self.sPos = sPos # start position within the article text + self.ePos = ePos # end pos - index of 1st char not in section def __str__(self): - return "Section object: %s reason: '%s' %d %d\n'%s'\n" % \ - (self.secType, self.reason, self.sPos, self.ePos, self.text[:40]) + return "Section object: %s reason: '%s' %d %d\n'%s'\n" % \ + (self.secType, self.reason, self.sPos, self.ePos, self.text[:40]) #------------------ end Class SectionBoundary class ExtTextSplitter (object): #{ ''' Is: a class that knows how to split extracted text for a PDF - into multiple parts: - body, references, manuscript figs, "star methods", supp data - (any of these sections except body can be the empty string) - "Star Methods" is a methods section that some journals put after - the references and before any supplemental data. + into multiple parts: + body, references, manuscript figs, "star methods", supp data + (any of these sections except body can be the empty str. + "Star Methods" is a methods section that some journals put after + the references and before any supplemental data. Has: floats: minFraction, maxFraction and a TypedRegexMatcher - The min/maxFractions are used ... JIM: doc this better - once any supp data and "star methods" section are removed from the end - of the text. - If the length of the predicted reference section is - > maxFraction of the total text length from the end - or - < minFraction from the end - then the prediction is considered invalid, and the reference section - is set to '', and the body is not split. - - Does: splitSections() - get the section strings - findSections() - get descriptions of the section + The min/maxFractions are used ... JIM: doc this better + once any supp data and "star methods" section are removed from the end + of the text. 
+ If the length of the predicted reference section is + > maxFraction of the total text length from the end + or + < minFraction from the end + then the prediction is considered invalid, and the reference section + is set to '', and the body is not split. + + Does: splitSections() - get the section str. + findSections() - get descriptions of the section ''' # Names of the match/regex types @@ -182,46 +181,46 @@ class ExtTextSplitter (object): #{ # Different journals/articles may have words before "Figure": # any single word, "\w+", or the specific word combos below OPT_FIG_START = r'(?:(?:\w+' + \ - '|' + spacedOutRegex('supp data') + \ - '|' + spacedOutRegex('supplemental data') + \ - '|' + spacedOutRegex('supplementary data') + \ - '|' + spacedOutRegex('extended data') + \ - r') )?' + '|' + spacedOutRegex('supp data') + \ + '|' + spacedOutRegex('supplemental data') + \ + '|' + spacedOutRegex('supplementary data') + \ + '|' + spacedOutRegex('extended data') + \ + r') )?' # Dict defining all the section start tags and their match types # End each w/ \n or \b to force line or word boundaries # The startPattern on TypedRegexMatcher constructor sets '\n' for line start regexDict = { - REF_SECTION_PRIMARY : [spacedOutRegex("References") + '\n', - spacedOutRegex("Literature Cited") + '\n', - spacedOutRegex("References and Notes") + '\n', - ], - REF_SECTION_SECONDARY: [spacedOutRegex("Reference") + '\n', - spacedOutRegex("Acknowledgements") + r'\b', - spacedOutRegex("Acknowledgments") + r'\b', - spacedOutRegex("Conflicts of Interest") + r'\b', - spacedOutRegex("Conflict of Interest") + r'\b', - ], - MANUSCRIPT_FIGURES : [OPT_FIG_START + spacedOutRegex("Figure")+ r'\b', - OPT_FIG_START + spacedOutRegex("Fig") + r'\b', - OPT_FIG_START + spacedOutRegex("Table") + r'\b', - ], - STAR_METHODS : [spacedOutRegex("Star") + "[ ]*[ *+][ ]*" + - spacedOutRegex("Methods") + '\n', - ], - SUPP_DATA : [spacedOutRegex(SUPP_DATA_TAG) + '\n', - ], - } + REF_SECTION_PRIMARY : 
[spacedOutRegex("References") + '\n', + spacedOutRegex("Literature Cited") + '\n', + spacedOutRegex("References and Notes") + '\n', + ], + REF_SECTION_SECONDARY: [spacedOutRegex("Reference") + '\n', + spacedOutRegex("Acknowledgements") + r'\b', + spacedOutRegex("Acknowledgments") + r'\b', + spacedOutRegex("Conflicts of Interest") + r'\b', + spacedOutRegex("Conflict of Interest") + r'\b', + ], + MANUSCRIPT_FIGURES : [OPT_FIG_START + spacedOutRegex("Figure")+ r'\b', + OPT_FIG_START + spacedOutRegex("Fig") + r'\b', + OPT_FIG_START + spacedOutRegex("Table") + r'\b', + ], + STAR_METHODS : [spacedOutRegex("Star") + "[ ]*[ *+][ ]*" + + spacedOutRegex("Methods") + '\n', + ], + SUPP_DATA : [spacedOutRegex(SUPP_DATA_TAG) + '\n', + ], + } def __init__(self, - minFraction=0.05, # min fraction predicted for ref section - maxFraction=0.4, # max fraction of whole doc that the - # predicted ref section is allowed to be - ): - self.minFraction = minFraction - self.maxFraction = maxFraction - self.matcher = TypedRegexMatcher(self.regexDict, startPattern='\n') - self.initSections('') + minFraction=0.05, # min fraction predicted for ref section + maxFraction=0.4, # max fraction of whole doc that the + # predicted ref section is allowed to be + ): + self.minFraction = minFraction + self.maxFraction = maxFraction + self.matcher = TypedRegexMatcher(self.regexDict, startPattern='\n') + self.initSections('') # ---------------------------------- def getRegexMatcher(self): return self.matcher @@ -229,257 +228,257 @@ def getExtText(self): return self.extText # ---------------------------------- def initSections(self, extText): - """ - initialize all the text sections to missing - """ - self.extText = extText - self.lenExtText = len(extText) - - # body is whole thing for now. 
- self.bodyS = Section(SECTION_BODY, extText, "body start", 0, - self.lenExtText) - - # mark all other sections as missing for now - self.refsS = Section(SECTION_REFS, '', 'no ref section match', - self.lenExtText, self.lenExtText) - self.mfigS = Section(SECTION_MFIGS, '', 'no manuscript figs match', - self.lenExtText, self.lenExtText) - self.starS = Section(SECTION_STAR, '', 'no star methods match', - self.lenExtText, self.lenExtText) - self.suppS = Section(SECTION_SUPP, '', 'no supp data match', - self.lenExtText, self.lenExtText) + """ + initialize all the text sections to missing + """ + self.extText = extText + self.lenExtText = len(extText) + + # body is whole thing for now. + self.bodyS = Section(SECTION_BODY, extText, "body start", 0, + self.lenExtText) + + # mark all other sections as missing for now + self.refsS = Section(SECTION_REFS, '', 'no ref section match', + self.lenExtText, self.lenExtText) + self.mfigS = Section(SECTION_MFIGS, '', 'no manuscript figs match', + self.lenExtText, self.lenExtText) + self.starS = Section(SECTION_STAR, '', 'no star methods match', + self.lenExtText, self.lenExtText) + self.suppS = Section(SECTION_SUPP, '', 'no supp data match', + self.lenExtText, self.lenExtText) # ---------------------------------- def splitSections(self, extText): - """ - #### if you just want the text of the sections, call this #### - Split the exText, return the sections text - Return the text of the sections tuple: - (body, ref section, manuscript figs, star methods, supp data) - """ - self.findSections(extText) - return (self.bodyS.text, - self.refsS.text, - self.mfigS.text, - self.starS.text, - self.suppS.text, - ) + """ + #### if you just want the text of the sections, call this #### + Split the exText, return the sections text + Return the text of the sections tuple: + (body, ref section, manuscript figs, star methods, supp data) + """ + self.findSections(extText) + return (self.bodyS.text, + self.refsS.text, + self.mfigS.text, + 
self.starS.text, + self.suppS.text, + ) # ---------------------------------- def findSections(self, extText): - """ - #### if you want details of the sections, call this #### - Find the sections in text. - Set self.bodyS, refsS, mfigS, starS, suppS - to Section objects describing each section - Return the 5 Section objects: - """ - self.initSections(extText) - - matches = self.matcher.match(extText) - if len(matches) != 0: # got some matches - # The order of these calls is important - self.findSuppSection() - self.findStarSection() - self.findRefsSection() - self.findMfigSection() - self.findBodySection() - return self.bodyS, self.refsS, self.mfigS, self.starS, self.suppS + """ + #### if you want details of the sections, call this #### + Find the sections in text. + Set self.bodyS, refsS, mfigS, starS, suppS + to Section objects describing each section + Return the 5 Section objects: + """ + self.initSections(extText) + + matches = self.matcher.match(extText) + if len(matches) != 0: # got some matches + # The order of these calls is important + self.findSuppSection() + self.findStarSection() + self.findRefsSection() + self.findMfigSection() + self.findBodySection() + return self.bodyS, self.refsS, self.mfigS, self.starS, self.suppS # ---------------------------------- def findSuppSection(self): - """ - Set self.suppS - Assumes: - self.suppS is initialized to be length 0 at end of self.extText - """ - section = self.suppS - matches = self.matcher.getMatches(self.SUPP_DATA) - if len(matches) != 0: # matched supp data start tags - m = matches[-1] # use last match - - section.reason = m.text - section.sPos = m.sPos - section.ePos = self.lenExtText - section.text = self.extText[section.sPos : section.ePos] - - # else assume self.suppS is already initialized correctly - return + """ + Set self.suppS + Assumes: + self.suppS is initialized to be length 0 at end of self.extText + """ + section = self.suppS + matches = self.matcher.getMatches(self.SUPP_DATA) + if 
len(matches) != 0: # matched supp data start tags + m = matches[-1] # use last match + + section.reason = m.text + section.sPos = m.sPos + section.ePos = self.lenExtText + section.text = self.extText[section.sPos : section.ePos] + + # else assume self.suppS is already initialized correctly + return # ---------------------------------- def findStarSection(self): - """ - Set self.starS - Assumes: - self.suppS is set appropriately - self.starS is initialized to be length 0 at end of self.extText - """ - section = self.starS - allMatches = self.matcher.getMatches(self.STAR_METHODS) - - matches = [] - for m in allMatches: # collect matches before supp data start - if m.sPos < self.suppS.sPos: matches.append(m) - else: break - - if len(matches) == 0: # no star methods match - section.sPos = self.suppS.sPos - section.ePos = self.suppS.sPos - else: # got a match - matches.reverse() - m = self.findNotTooLateMatch(matches) - if m == None: # no reasonable match - section.reason = 'star methods too close to end (%d)' % \ - matches[-1].sPos - section.sPos = self.suppS.sPos - section.ePos = self.suppS.sPos - else: # got a good one - section.reason = m.text - section.sPos = m.sPos - section.ePos = self.suppS.sPos - - section.text = self.extText[section.sPos : section.ePos] - return + """ + Set self.starS + Assumes: + self.suppS is set appropriately + self.starS is initialized to be length 0 at end of self.extText + """ + section = self.starS + allMatches = self.matcher.getMatches(self.STAR_METHODS) + + matches = [] + for m in allMatches: # collect matches before supp data start + if m.sPos < self.suppS.sPos: matches.append(m) + else: break + + if len(matches) == 0: # no star methods match + section.sPos = self.suppS.sPos + section.ePos = self.suppS.sPos + else: # got a match + matches.reverse() + m = self.findNotTooLateMatch(matches) + if m == None: # no reasonable match + section.reason = 'star methods too close to end (%d)' % \ + matches[-1].sPos + section.sPos = 
self.suppS.sPos + section.ePos = self.suppS.sPos + else: # got a good one + section.reason = m.text + section.sPos = m.sPos + section.ePos = self.suppS.sPos + + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findRefsSection(self): - """ - Set self.refsS - Assumes: - self.starS is set appropriately - self.refsS is initialized to be length 0 at end of self.extText - """ - section = self.refsS - primary = self.matcher.getMatches(self.REF_SECTION_PRIMARY) - secondary = self.matcher.getMatches(self.REF_SECTION_SECONDARY) - - m, primaryReason = self.findRefsMatch(primary) - - if m: # got a good primary match - section.reason = primaryReason - section.sPos = m.sPos - section.ePos = self.starS.sPos - - elif len(secondary) == 0: # no good primary, and no secondary - section.reason = primaryReason - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - else: # no good primary, but some secondary - m, secondaryReason = self.findRefsMatch(secondary) - - if m: # got good secondary match - section.reason = secondaryReason - section.sPos = m.sPos - section.ePos = self.starS.sPos - else: # no good secondary match either - section.reason = primaryReason + '; \n' + secondaryReason - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - - section.text = self.extText[section.sPos : section.ePos] - return + """ + Set self.refsS + Assumes: + self.starS is set appropriately + self.refsS is initialized to be length 0 at end of self.extText + """ + section = self.refsS + primary = self.matcher.getMatches(self.REF_SECTION_PRIMARY) + secondary = self.matcher.getMatches(self.REF_SECTION_SECONDARY) + + m, primaryReason = self.findRefsMatch(primary) + + if m: # got a good primary match + section.reason = primaryReason + section.sPos = m.sPos + section.ePos = self.starS.sPos + + elif len(secondary) == 0: # no good primary, and no secondary + section.reason = primaryReason + section.sPos = self.starS.sPos + 
section.ePos = self.starS.sPos + else: # no good primary, but some secondary + m, secondaryReason = self.findRefsMatch(secondary) + + if m: # got good secondary match + section.reason = secondaryReason + section.sPos = m.sPos + section.ePos = self.starS.sPos + else: # no good secondary match either + section.reason = primaryReason + '; \n' + secondaryReason + section.sPos = self.starS.sPos + section.ePos = self.starS.sPos + + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findRefsMatch(self, allMatches): - """ - Find a good match in 'allMatches'. - Return the good match object (or None) + reason - Assumes: matches is sorted from start of doc to end - """ - matches = [] - for m in allMatches: # collect matches before star methods start - if m.sPos < self.starS.sPos: matches.append(m) - else: break - - if len(matches) != 0: # matched refs start tags - matches.reverse() # find last occurances first - m = self.findNotTooLateMatch(matches) - if m != None: # match that is not too late - - # is length of refs too big? - refLength = self.starS.sPos - m.sPos - if float(refLength)/float(self.starS.sPos) <= self.maxFraction: - reason = m.text - else: # refs section too big - reason = "refs match is too early: '%s' (%d)" % \ - (m.text, m.sPos) - m = None - else: # no good match - reason ="refs matches too close to end, earliest: '%s' (%d)" % \ - (matches[-1].text, matches[-1].sPos) - m = None - else: # no start tag found - m = None - reason = 'no refs match' - - return m, reason + """ + Find a good match in 'allMatches'. 
+ Return the good match object (or None) + reason + Assumes: matches is sorted from start of doc to end + """ + matches = [] + for m in allMatches: # collect matches before star methods start + if m.sPos < self.starS.sPos: matches.append(m) + else: break + + if len(matches) != 0: # matched refs start tags + matches.reverse() # find last occurances first + m = self.findNotTooLateMatch(matches) + if m != None: # match that is not too late + + # is length of refs too big? + refLength = self.starS.sPos - m.sPos + if float(refLength)/float(self.starS.sPos) <= self.maxFraction: + reason = m.text + else: # refs section too big + reason = "refs match is too early: '%s' (%d)" % \ + (m.text, m.sPos) + m = None + else: # no good match + reason ="refs matches too close to end, earliest: '%s' (%d)" % \ + (matches[-1].text, matches[-1].sPos) + m = None + else: # no start tag found + m = None + reason = 'no refs match' + + return m, reason # ---------------------------------- def findMfigSection(self): - """ - Look forward from refs section to star methods & see if any - figures/tables in between. - If so, truncate refs section at first figure/table start. - set the mfigS to be the fig start to star methods start. 
- Assume: - self.starS is set appropriately - self.mfigS is initialized to be length 0 at end of self.extText - """ - section = self.mfigS - matches = self.matcher.getMatches(self.MANUSCRIPT_FIGURES) - - figMatch = None # the match of the 1st fig after refs start - for m in matches: - if self.refsS.sPos < m.sPos and m.sPos < self.starS.sPos: - figMatch = m - break - if figMatch: # got a fig after refs & before any starS - section.reason = figMatch.text - section.sPos = figMatch.sPos - section.ePos = self.starS.sPos - section.text = self.extText[section.sPos : section.ePos] - - self.refsS.ePos = figMatch.sPos # adjust end of refs section - self.refsS.text = self.extText[self.refsS.sPos: self.refsS.ePos] - else: # no fig match - section.sPos = self.starS.sPos - section.ePos = self.starS.sPos - # section.text & reason should be set ok from initSections - return + """ + Look forward from refs section to star methods & see if any + figures/tables in between. + If so, truncate refs section at first figure/table start. + set the mfigS to be the fig start to star methods start. 
+ Assume: + self.starS is set appropriately + self.mfigS is initialized to be length 0 at end of self.extText + """ + section = self.mfigS + matches = self.matcher.getMatches(self.MANUSCRIPT_FIGURES) + + figMatch = None # the match of the 1st fig after refs start + for m in matches: + if self.refsS.sPos < m.sPos and m.sPos < self.starS.sPos: + figMatch = m + break + if figMatch: # got a fig after refs & before any starS + section.reason = figMatch.text + section.sPos = figMatch.sPos + section.ePos = self.starS.sPos + section.text = self.extText[section.sPos : section.ePos] + + self.refsS.ePos = figMatch.sPos # adjust end of refs section + self.refsS.text = self.extText[self.refsS.sPos: self.refsS.ePos] + else: # no fig match + section.sPos = self.starS.sPos + section.ePos = self.starS.sPos + # section.text & reason should be set ok from initSections + return # ---------------------------------- def findBodySection(self): - section = self.bodyS - section.sPos = 0 - section.ePos = self.refsS.sPos - section.text = self.extText[section.sPos : section.ePos] - return + section = self.bodyS + section.sPos = 0 + section.ePos = self.refsS.sPos + section.text = self.extText[section.sPos : section.ePos] + return # ---------------------------------- def findNotTooLateMatch(self, matches, totalTextLength='default'): - """ - Given a list of matches, return the 1st one that is not too close - to the end. - (at least self.minFactor from textEnd JIM: doc better) - Return None if we don't find one - """ - if totalTextLength == 'default': totalTextLength = self.lenExtText - retVal = None - for m in matches: - if not self.isTooCloseToEnd(m.sPos, totalTextLength): - retVal = m - break - return retVal + """ + Given a list of matches, return the 1st one that is not too close + to the end. 
+ (at least self.minFactor from textEnd JIM: doc better) + Return None if we don't find one + """ + if totalTextLength == 'default': totalTextLength = self.lenExtText + retVal = None + for m in matches: + if not self.isTooCloseToEnd(m.sPos, totalTextLength): + retVal = m + break + return retVal # ---------------------------------- def isTooCloseToEnd(self, sPos, totalTextLength='default'): - """ + """ Return Boolean: Is the predicted (section) start position too - close to the end to be reasonable? + close to the end to be reasonable? - (if too close, it is likely some text in the PDF page footer) + (if too close, it is likely some text in the PDF page footer) """ - if totalTextLength == 'default': totalTextLength = self.lenExtText + if totalTextLength == 'default': totalTextLength = self.lenExtText sectionLen = totalTextLength - sPos sectionLengthFraction = float(sectionLen)/totalTextLength return sectionLengthFraction < self.minFraction @@ -493,114 +492,114 @@ class TypedMatch (object): Represents a match from a TypedRegexMatcher. 
""" def __init__(self, matchType, text, sPos, ePos): - self.matchType = matchType # types from the regexDict passed - # to TypedRegexMatcher - self.text = text # the string that matched the regex - self.sPos = sPos # start pos in the text of matching str - self.ePos = ePos # end pos in the text of matching str - # i.e., 1st index in text after match + self.matchType = matchType # types from the regexDict passed + # to TypedRegexMatcher + self.text = text # the str.that matched the regex + self.sPos = sPos # start pos in the text of matching str + self.ePos = ePos # end pos in the text of matching str + # i.e., 1st index in text after match def __str__(self): - return "TypedMatched object: %s '%s' %d %d" % \ - (self.matchType, self.text, self.sPos, self.ePos) + return "TypedMatched object: %s '%s' %d %d" % \ + (self.matchType, self.text, self.sPos, self.ePos) #----------------------------------------------- class TypedRegexMatcher (object): #{ """ - Is: A class that matches a set of regex's against strings. - Each regex has a user defined type/category name. - Once you have matched against a string, you can get back the matches - by type, in the order the matches occur in the string. + Is: A class that matches a set of regex's against str.. + Each regex has a user defined type/category name. + Once you have matched against a str. you can get back the matches + by type, in the order the matches occur in the str. - This is built to make one regex match pass over a string once, and - yet pull out all the individual matches, by type. + This is built to make one regex match pass over a str.once, and + yet pull out all the individual matches, by type. Has: Dict of typed regex's: - {'type 1' : [regex pattern strings...], - 'type 2' : [regex pattern strings...], - ... - } - A honking regex built from this dict + {'type 1' : [regex pattern str....], + 'type 2' : [regex pattern str....], + ... 
+    }
+    A honking regex built from this dict

    Does: match('sometext')
-    After a match:
-    getMatches('type'), getAllMatches()
-    return lists of TypedMatch objects in the order they appear
-    in 'sometext'
+    After a match:
+    getMatches('type'), getAllMatches()
+    return lists of TypedMatch objects in the order they appear
+    in 'sometext'
    """
    # -----------------------

    def __init__(self,
-        regexDict, # Dict of regex's as above
-        startPattern='', # Regex pattern str to match at the
-        # start of all regex's.
-        # Use this to force matches at start
-        # of paragraphs.
-        flags=re.IGNORECASE, # Regex flags when matching, see re.
-        # flags=0 to get re module defaults
-        ):
-        self.regexDict = regexDict
-        self.regexTypes = self.regexDict.keys()
-        self.startPattern = startPattern
-        self.flags = flags
-
-        self.buildRegexStr()
-        self.regex = re.compile(self.regexStr, self.flags)
-
-        self.initMatchResults()
+        regexDict, # Dict of regex's as above
+        startPattern='', # Regex pattern str to match at the
+        # start of all regex's.
+        # Use this to force matches at start
+        # of paragraphs.
+        flags=re.IGNORECASE, # Regex flags when matching, see re.
+        # flags=0 to get re module defaults
+        ):
+        self.regexDict = regexDict
+        self.regexTypes = list(self.regexDict.keys())
+        self.startPattern = startPattern
+        self.flags = flags
+
+        self.buildRegexStr()
+        self.regex = re.compile(self.regexStr, self.flags)
+
+        self.initMatchResults()

    # ----------------------------------

    def buildRegexStr(self):
-        """
-        Set self.refRegex to the honking regex...
-        Each regex type is its own named regex group.
-        """
-        regexParts = []
-        for tType,regList in self.regexDict.items():
-        rs = r'(?P<%s>%s)' % ( tType, '|'.join(regList) )
-        regexParts.append(rs)
-
-        self.regexStr = self.startPattern + '(?:' + '|'.join(regexParts) + ')'
+        """
+        Set self.regexStr to the honking regex...
+        Each regex type is its own named regex group.
+ """ + regexParts = [] + for tType,regList in list(self.regexDict.items()): + rs = r'(?P<%s>%s)' % ( tType, '|'.join(regList) ) + regexParts.append(rs) + + self.regexStr = self.startPattern + '(?:' + '|'.join(regexParts) + ')' # ---------------------------------- def initMatchResults(self): - """ - Initialize matchesByType and allMatches to empty lists - """ - self.matchesByType = {} - for t in self.regexTypes: - self.matchesByType[t] = [] - self.allMatches = [] + """ + Initialize matchesByType and allMatches to empty lists + """ + self.matchesByType = {} + for t in self.regexTypes: + self.matchesByType[t] = [] + self.allMatches = [] # ---------------------------------- def match(self, text): - """ - Match the regex's against the text. - Return the list of all matches (TypeMatch objects) - """ - self.initMatchResults() + """ + Match the regex's against the text. + Return the list of all matches (TypeMatch objects) + """ + self.initMatchResults() - for reM in self.regex.finditer(text): # for the regex Match objects + for reM in self.regex.finditer(text): # for the regex Match objects - # for the named groups: - # Note all named groups are in the groupdict, - # even if there is no match to that group - for mType, mText in reM.groupdict().items(): - if mText != None: break # when we find group w/ a value, - # that is the matching group + # for the named groups: + # Note all named groups are in the groupdict, + # even if there is no match to that group + for mType, mText in list(reM.groupdict().items()): + if mText != None: break # when we find group w/ a value, + # that is the matching group - sPos, ePos = reM.span(mType) - m = TypedMatch(mType, mText, sPos, ePos) # our own match object + sPos, ePos = reM.span(mType) + m = TypedMatch(mType, mText, sPos, ePos) # our own match object - self.allMatches.append(m) - self.matchesByType[mType].append(m) + self.allMatches.append(m) + self.matchesByType[mType].append(m) - return self.allMatches + return self.allMatches # 
---------------------------------- def getMatches(self, regexType): - if regexType not in self.regexTypes: - raise KeyError("invalid match type '%s'" % regexType) - return self.matchesByType[regexType] + if regexType not in self.regexTypes: + raise KeyError("invalid match type '%s'" % regexType) + return self.matchesByType[regexType] def getAllMatches(self): return self.allMatches def getRegexStr(self): return self.regexStr @@ -613,68 +612,68 @@ def getRegexTypes(self): return self.regexTypes # ----------------------- if __name__ == "__main__": # some ad hoc tests - print "Running ad hoc tests - modify these as needed" + print("Running ad hoc tests - modify these as needed") if False: # TypedRegexMatcher tests - PARA_BOUNDARY = '\n\n' - regexDict = { - 'animal' : [spacedOutRegex('duck'), - r'\bdog\b'], - 'tree' : ['[oO]ak|fir', 'apple', 'beech',], - } - #matcher = TypedRegexMatcher(regexDict, startPattern=PARA_BOUNDARY) - matcher = TypedRegexMatcher(regexDict, ) - - print matcher.getRegexStr() - - s = 'the \n\ndu ck\n and DoG climbed an Oak tree' - #s = 'no matches here tree' - - print len(matcher.match(s)) - for m in matcher.getAllMatches(): - print str(m) - print - - for t in regexDict.keys(): - print "%s:\n[" % t - for m in matcher.getMatches(t): - print str(m) - print '] ------' - #x = matcher.getMatches('foo') # test using invalid regex type + PARA_BOUNDARY = '\n\n' + regexDict = { + 'animal' : [spacedOutRegex('duck'), + r'\bdog\b'], + 'tree' : ['[oO]ak|fir', 'apple', 'beech',], + } + #matcher = TypedRegexMatcher(regexDict, startPattern=PARA_BOUNDARY) + matcher = TypedRegexMatcher(regexDict, ) + + print(matcher.getRegexStr()) + + s = 'the \n\ndu ck\n and DoG climbed an Oak tree' + #s = 'no matches here tree' + + print(len(matcher.match(s))) + for m in matcher.getAllMatches(): + print(str(m)) + print() + + for t in list(regexDict.keys()): + print("%s:\n[" % t) + for m in matcher.getMatches(t): + print(str(m)) + print('] ------') + #x = matcher.getMatches('foo') 
# test using invalid regex type if True: # ExtTextSplitter tests - def runSectionTest(sp, doc): - print "--------- doc length: %d" % len(doc) - (bodyS, refsS, mfigS, starS, suppS) = sp.findSections(doc) - print str(bodyS) - print str(refsS) - print str(mfigS) - print str(starS) - print str(suppS) - # ------- + def runSectionTest(sp, doc): + print("--------- doc length: %d" % len(doc)) + (bodyS, refsS, mfigS, starS, suppS) = sp.findSections(doc) + print(str(bodyS)) + print(str(refsS)) + print(str(mfigS)) + print(str(starS)) + print(str(suppS)) + # ------- # PARA_BOUNDARY + 'References' + \ # "\n1234567890" + \ # PARA_BOUNDARY + 'star*methods' + \ # PARA_BOUNDARY + 'star*methods' + \ - doc = "1234567890" + \ - '\nfigure 1: here is a legend' + \ - '\n' + 'references' + \ - "\n1234567890" + \ - '\n' + 'conf licts of int erest' + \ - "\n1234567890" + \ - '\nsupplementary data TABLE 2: here is a legend' + \ - "\n1234567890" + \ - '\nfigure 3: here is a legend' + \ - "\n1234567890" + \ - '\n' + 'star*methods' + \ - "\n1234567890" + \ - '\n' + SUPP_DATA_TAG + \ - "\n1234567890" \ - '\n' + 'star*methods' + \ - "\n1234567890" - #doc = open('6114980.txt', 'r').read() - #doc = "1234567890" + PARA_BOUNDARY + 'foo' + "\n1234567890" - sp = ExtTextSplitter(maxFraction=0.9, minFraction=.1) - #print sp.getRegexMatcher().getRegexStr() - runSectionTest(sp, doc) + doc = "1234567890" + \ + '\nfigure 1: here is a legend' + \ + '\n' + 'references' + \ + "\n1234567890" + \ + '\n' + 'conf licts of int erest' + \ + "\n1234567890" + \ + '\nsupplementary data TABLE 2: here is a legend' + \ + "\n1234567890" + \ + '\nfigure 3: here is a legend' + \ + "\n1234567890" + \ + '\n' + 'star*methods' + \ + "\n1234567890" + \ + '\n' + SUPP_DATA_TAG + \ + "\n1234567890" \ + '\n' + 'star*methods' + \ + "\n1234567890" + #doc = open('6114980.txt', 'r').read() + #doc = "1234567890" + PARA_BOUNDARY + 'foo' + "\n1234567890" + sp = ExtTextSplitter(maxFraction=0.9, minFraction=.1) + #print 
sp.getRegexMatcher().getRegexStr() + runSectionTest(sp, doc)