Skip to content

Commit

Permalink
Don’t return empty CEM spans - fixes #12
Browse files Browse the repository at this point in the history
Ensure that the CemTagger considers tokens entirely made up of ignore prefix/suffix as in the stoplist. Ensure that the Document cems method skips any spans that have been reduced to nothing by boundary adjustments.
  • Loading branch information
mcs07 committed Jan 22, 2017
1 parent 8b54517 commit 3a7bc53
Show file tree
Hide file tree
Showing 3 changed files with 60 additions and 7 deletions.
16 changes: 16 additions & 0 deletions chemdataextractor/doc/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,10 @@ def cems(self):
end -= 1
break

# If entity has been reduced to nothing by adjusting boundaries, skip it
if start >= end:
continue

currenttext = self.text[start-self.start:end-self.start]

# Do splits
Expand Down Expand Up @@ -557,6 +561,18 @@ def __repr__(self):
def __str__(self):
return self.text

def __eq__(self, other):
"""Span objects are equal if the source text is equal, and the start and end indices are equal."""
# An object that is not an instance of this span's own class never compares equal.
if not isinstance(other, self.__class__):
return False
return self.text == other.text and self.start == other.start and self.end == other.end

def __ne__(self, other):
# Defined as the negation of ==, so inequality always agrees with __eq__.
return not self == other

def __hash__(self):
# Hash the same (text, start, end) fields that __eq__ compares, so equal spans hash equally.
return hash((self.text, self.start, self.end))

@property
def length(self):
"""The offset length of this span in the original text."""
Expand Down
24 changes: 19 additions & 5 deletions chemdataextractor/nlp/cem.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
import logging
import re

import six

from ..text import bracket_level
from .lexicon import ChemLexicon
from .tag import BaseTagger, CrfTagger, DictionaryTagger
Expand Down Expand Up @@ -517,12 +519,24 @@ class CemTagger(BaseTagger):

def _in_stoplist(self, entity):
"""Return True if the entity is in the stoplist."""
for suffix in IGNORE_SUFFIX:
if entity.endswith(suffix):
entity = entity[:-len(suffix)]
start = 0
end = len(entity)
# Adjust boundaries to exclude disallowed prefixes/suffixes
for prefix in IGNORE_PREFIX:
if entity.startswith(prefix):
entity = entity[len(prefix):]
# print('%s removing %s' % (currenttext, prefix))
start += len(prefix)
break
for suffix in IGNORE_SUFFIX:
if entity.endswith(suffix):
# print('%s removing %s' % (currenttext, suffix))
end -= len(suffix)
break
# Return True if entity has been reduced to nothing by adjusting boundaries
if start >= end:
return True
# Return True if adjusted entity is in the literal stoplist
entity = entity[start:end]
if entity in STOPLIST:
return True
# log.debug('Entity: %s', entity)
Expand Down Expand Up @@ -598,5 +612,5 @@ def tag(self, tokens):
if re.match('^(\d{1,2}[A-Za-z]?|I|II|III|IV|V|VI|VII|VIII|IX)$', entity_tokens[-2]):
log.debug('Removing %s from end of CEM', entity_tokens[-2])
tags[end_i-3:end_i] = [None, None, None]
tokentags = zip(tokens, tags)
tokentags = list(six.moves.zip(tokens, tags))
return tokentags
27 changes: 25 additions & 2 deletions tests/test_nlp_cem.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
import logging
import unittest

from chemdataextractor.nlp.cem import CiDictCemTagger, CrfCemTagger

from chemdataextractor.doc import Span, Document
from chemdataextractor.nlp.cem import CiDictCemTagger, CrfCemTagger, CemTagger

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)
Expand Down Expand Up @@ -69,6 +69,29 @@ def test_unicode_combining_characters(self):
)


class TestCemTagger(unittest.TestCase):
"""Test combined CemTagger."""

def test_stoplist(self):
"""Test CemTagger removes words in stoplist, including words entirely made up of ignore prefix/suffix.
GitHub issue #12.
"""
ct = CemTagger()
# A name with a trailing "-aromatic" is still expected to receive the 'B-CM' tag.
self.assertEqual([(('benzene-aromatic', 'NN'), 'B-CM')], ct.tag([('benzene-aromatic', 'NN')]))
# These tokens are expected to get no tag (None); presumably they consist only of
# ignored prefix/suffix text -- confirm against IGNORE_PREFIX / IGNORE_SUFFIX.
self.assertEqual([(('-aromatic', 'JJ'), None)], ct.tag([('-aromatic', 'JJ')]))
self.assertEqual([(('non-aromatic', 'JJ'), None)], ct.tag([('non-aromatic', 'JJ')]))

def test_cems_stoplist(self):
"""Test Document cems removes words in stoplist, including words entirely made up of ignore prefix/suffix.
GitHub issue #12.
"""
# Only the leading "benzene" (offsets 0 to 7) is expected as a span; the "-aromatic" tail is excluded.
self.assertEqual([Span('benzene', 0, 7)], Document('benzene-aromatic').cems)
# These inputs are expected to yield no spans at all rather than an empty span.
self.assertEqual([], Document('-aromatic').cems)
self.assertEqual([], Document('non-aromatic').cems)


# TODO: Test entity recognition on a sentence containing a generic abbreviation that is only picked up through its definition


Expand Down

0 comments on commit 3a7bc53

Please sign in to comment.