Don’t return empty CEM spans - fixes #12

Ensure that the CemTagger considers tokens entirely made up of ignore prefix/suffix as in the stoplist. Ensure that the Document cems method skips any spans that have been reduced to nothing by boundary adjustments.
mcs07 · Jan 22, 2017 · 3a7bc53 · 3a7bc53
1 parent 8b54517
commit 3a7bc53
Show file tree

Hide file tree

Showing 3 changed files with 60 additions and 7 deletions.
diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py
@@ -451,6 +451,10 @@ def cems(self):
                             end -= 1
                             break
 
+            # If entity has been reduced to nothing by adjusting boundaries, skip it
+            if start >= end:
+                continue
+
             currenttext = self.text[start-self.start:end-self.start]
 
             # Do splits
@@ -557,6 +561,18 @@ def __repr__(self):
     def __str__(self):
         return self.text
 
+    def __eq__(self, other):
+        """Span objects are equal if the source text is equal, and the start and end indices are equal."""
+        if not isinstance(other, self.__class__):
+            return False
+        return self.text == other.text and self.start == other.start and self.end == other.end
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        return hash((self.text, self.start, self.end))
+
     @property
     def length(self):
         """The offset length of this span in the original text."""

diff --git a/chemdataextractor/nlp/cem.py b/chemdataextractor/nlp/cem.py
@@ -16,6 +16,8 @@
 import logging
 import re
 
+import six
+
 from ..text import bracket_level
 from .lexicon import ChemLexicon
 from .tag import BaseTagger, CrfTagger, DictionaryTagger
@@ -517,12 +519,24 @@ class CemTagger(BaseTagger):
 
     def _in_stoplist(self, entity):
         """Return True if the entity is in the stoplist."""
-        for suffix in IGNORE_SUFFIX:
-            if entity.endswith(suffix):
-                entity = entity[:-len(suffix)]
+        start = 0
+        end = len(entity)
+        # Adjust boundaries to exclude disallowed prefixes/suffixes
         for prefix in IGNORE_PREFIX:
             if entity.startswith(prefix):
-                entity = entity[len(prefix):]
+                # print('%s removing %s' % (currenttext, prefix))
+                start += len(prefix)
+                break
+        for suffix in IGNORE_SUFFIX:
+            if entity.endswith(suffix):
+                # print('%s removing %s' % (currenttext, suffix))
+                end -= len(suffix)
+                break
+        # Return True if entity has been reduced to nothing by adjusting boundaries
+        if start >= end:
+            return True
+        # Return True if adjusted entity is in the literal stoplist
+        entity = entity[start:end]
         if entity in STOPLIST:
             return True
         # log.debug('Entity: %s', entity)
@@ -598,5 +612,5 @@ def tag(self, tokens):
                             if re.match('^(\d{1,2}[A-Za-z]?|I|II|III|IV|V|VI|VII|VIII|IX)$', entity_tokens[-2]):
                                 log.debug('Removing %s from end of CEM', entity_tokens[-2])
                                 tags[end_i-3:end_i] = [None, None, None]
-        tokentags = zip(tokens, tags)
+        tokentags = list(six.moves.zip(tokens, tags))
         return tokentags
diff --git a/tests/test_nlp_cem.py b/tests/test_nlp_cem.py
@@ -17,8 +17,8 @@
 import logging
 import unittest
 
-from chemdataextractor.nlp.cem import CiDictCemTagger, CrfCemTagger
-
+from chemdataextractor.doc import Span, Document
+from chemdataextractor.nlp.cem import CiDictCemTagger, CrfCemTagger, CemTagger
 
 logging.basicConfig(level=logging.DEBUG)
 log = logging.getLogger(__name__)
@@ -69,6 +69,29 @@ def test_unicode_combining_characters(self):
             )
 
 
+class TestCemTagger(unittest.TestCase):
+    """Test combined CemTagger."""
+
+    def test_stoplist(self):
+        """Test CemTagger removes words in stoplist, including words entirely made up of ignore prefix/suffix.
+
+        GitHub issue #12.
+        """
+        ct = CemTagger()
+        self.assertEqual([(('benzene-aromatic', 'NN'), 'B-CM')], ct.tag([('benzene-aromatic', 'NN')]))
+        self.assertEqual([(('-aromatic', 'JJ'), None)], ct.tag([('-aromatic', 'JJ')]))
+        self.assertEqual([(('non-aromatic', 'JJ'), None)], ct.tag([('non-aromatic', 'JJ')]))
+
+    def test_cems_stoplist(self):
+        """Test Document cems removes words in stoplist, including words entirely made up of ignore prefix/suffix.
+
+        GitHub issue #12.
+        """
+        self.assertEqual([Span('benzene', 0, 7)], Document('benzene-aromatic').cems)
+        self.assertEqual([], Document('-aromatic').cems)
+        self.assertEqual([], Document('non-aromatic').cems)
+
+
 # TODO: Test entity recognition on a sentence containing a generic abbreviation that is only picked up through its definition