Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix tokenizer problem with commas inside NMR peaks #18

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions chemdataextractor/common/REG_EXP.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@

# Common regular expressions
# TODO: add br. s inside of the multiplicity
# NOTE(review): the unescaped ``br.`` alternative matches "br" followed by ANY
# single character (e.g. "brs", "brd"), not only a literal dot -- confirm that
# this is intended before tightening it to ``br\.``.
# Written as a raw string because ``\.`` is an invalid escape sequence in a
# normal string literal (it triggers a warning on recent Python versions); the
# pattern text itself is unchanged.
MULTIPLICITY = r'^(br\.)?(br.|s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$'
1 change: 1 addition & 0 deletions chemdataextractor/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .REG_EXP import *
12 changes: 10 additions & 2 deletions chemdataextractor/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def records(self):
sent_record = first_sent_records[0]
if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2):
head_def_record = sent_record
head_def_record_i = i
head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well

for record in el.records:
# Keep track of the most recent record with labels
Expand Down Expand Up @@ -215,10 +215,11 @@ def records(self):
continue
else:
# print(record.serialize())
# TODO: check the names and labels, not the whole record
# We have property values but no names or labels... try merge those from previous
if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record):
# head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name)
if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
if head_def_record:
record.names = head_def_record.names
record.labels = head_def_record.labels
Expand Down Expand Up @@ -272,6 +273,13 @@ def records(self):
record.names.append(name)

# Merge records with any shared name/label
# TODO: merging labels into a single record because of an 'and' is not a good idea (this should be handled in another part of the code)
temp_record = []
for record in records:
if len(record.labels) <= 1:
temp_record.append(record)

records.models = temp_record
len_l = len(records)
i = 0
while i < (len_l - 1):
Expand Down
6 changes: 5 additions & 1 deletion chemdataextractor/doc/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
from ..parse.mp import MpParser
from ..parse.tg import TgParser
from ..parse.nmr import NmrParser
from ..parse.doi import DoiParser
from ..parse.uvvis import UvvisParser
from ..parse.hrms import HRMSParser
from ..nlp.lexicon import ChemLexicon
from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS
from ..nlp.abbrev import ChemAbbreviationDetector
Expand Down Expand Up @@ -266,7 +268,8 @@ def _repr_html_(self):

class Paragraph(Text):

parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()]
parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(),
ContextParser(), DoiParser(), HRMSParser()]

def _repr_html_(self):
return '<p class="cde-paragraph">' + self.text + '</p>'
Expand Down Expand Up @@ -510,6 +513,7 @@ def records(self):
tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
for parser in self.parsers:
for record in parser.parse(tagged_tokens):
# print(record)
p = record.serialize()
if not p: # TODO: Potential performance issues?
continue
Expand Down
16 changes: 10 additions & 6 deletions chemdataextractor/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@

from .utils import python_2_unicode_compatible


log = logging.getLogger(__name__)


class BaseType(six.with_metaclass(ABCMeta)):

# This is assigned by ModelMeta to match the attribute on the Model
name = None

Expand Down Expand Up @@ -90,7 +88,6 @@ def process(self, value):


class ModelType(BaseType):

def __init__(self, model, **kwargs):
self.model_class = model
self.model_name = self.model_class.__name__
Expand All @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False):


class ListType(BaseType):

def __init__(self, field, default=None, **kwargs):
super(ListType, self).__init__(**kwargs)
self.field = field
Expand Down Expand Up @@ -376,6 +372,11 @@ class NmrSpectrum(BaseModel):
peaks = ListType(ModelType(NmrPeak))


class HRMS(BaseModel):
    """A high resolution mass spectrometry (HRMS) measurement."""
    # Formula-like text taken from the ``chemical_structure`` element of an
    # HRMS parse result; stored as a plain string.
    chemical_structure = StringType()


class MeltingPoint(BaseModel):
"""A melting point measurement."""
value = StringType()
Expand All @@ -394,6 +395,7 @@ class GlassTransition(BaseModel):
concentration = StringType(contextual=True)
concentration_units = StringType(contextual=True)


class QuantumYield(BaseModel):
"""A quantum yield measurement."""
value = StringType()
Expand Down Expand Up @@ -439,6 +441,8 @@ class Compound(BaseModel):
names = ListType(StringType())
labels = ListType(StringType())
roles = ListType(StringType())
doi = ListType(StringType())
hrms = ListType(ModelType(HRMS))
nmr_spectra = ListType(ModelType(NmrSpectrum))
ir_spectra = ListType(ModelType(IrSpectrum))
uvvis_spectra = ListType(ModelType(UvvisSpectrum))
Expand Down Expand Up @@ -502,8 +506,8 @@ def is_unidentified(self):
def is_id_only(self):
"""Return True if identifier information only."""
for key, value in self.items():
if key not in {'names', 'labels', 'roles'} and value:
if key not in {'names', 'labels', 'roles', 'doi'} and value:
return False
if self.names or self.labels:
if self.names or self.labels or self.doi:
return True
return False
8 changes: 6 additions & 2 deletions chemdataextractor/nlp/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,11 @@
from abc import ABCMeta, abstractmethod
import logging
import re

import six

from ..text import bracket_level, GREEK
from ..data import load_model

from ..common import REG_EXP

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -447,6 +446,8 @@ class ChemWordTokenizer(WordTokenizer):
NO_SPLIT_PREFIX_ENDING = re.compile('(^\(.*\)|^[\d,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$', re.U)
#: Don't split on hyphen if prefix or suffix match this regular expression
NO_SPLIT_CHEM = re.compile('([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I)

INSIDE_PEAK = re.compile(REG_EXP.MULTIPLICITY + '|^(M?Hz|\d+\.\d+)$')
#: Don't split on hyphen if the prefix is one of these sequences
NO_SPLIT_PREFIX = {
'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
Expand Down Expand Up @@ -657,6 +658,9 @@ def _subspan(self, s, span, nextspan):
# Split around colon unless it looks like we're in a chemical name
if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
return self._split_span(span, i, 1)
elif char == ',':
if not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)) and (self.INSIDE_PEAK.search(before) or self.INSIDE_PEAK.search(after)):
return self._split_span(span, i, 1)
elif char in {'x', '+', '−'}:
# Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers
if (i == 0 or self._is_number(before)) and self._is_number(after):
Expand Down
9 changes: 6 additions & 3 deletions chemdataextractor/parse/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@

from ..text import HYPHENS


log = logging.getLogger(__name__)


Expand All @@ -30,15 +29,19 @@ def flatten(tokens, start, result):
return result


def join(tokens, start, result):
def join(tokens, start, result, separator=' '):
"""Join tokens into a single string with spaces between."""
texts = []
if len(result) > 0:
for e in result:
for child in e.iter():
if child.text is not None:
texts.append(child.text)
return [E(result[0].tag, ' '.join(texts))]
return [E(result[0].tag, separator.join(texts))]


def join_comma(tokens, start, result):
    """Join tokens into a single string with ``,`` (no space) between them.

    Thin wrapper around ``join`` that passes ``tokens``, ``start`` and
    ``result`` through unchanged and only overrides ``separator``.
    """
    return join(tokens, start, result, separator=',')


def merge(tokens, start, result):
Expand Down
5 changes: 3 additions & 2 deletions chemdataextractor/parse/cem.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@

label_blacklist = R('^(31P|[12]H|[23]D|15N|14C|[4567890]\d+)$')

prefixed_label = R('^(cis|trans)-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$')
prefixed_label = R('^(cis|trans|[A-Za-z]{,3})-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$')

#: Chemical label. Very permissive - must be used in context to avoid false positives.
strict_chemical_label = Not(label_blacklist) + (alphanumeric | roman_numeral | letter_number | prefixed_label)('label')
Expand Down Expand Up @@ -124,7 +124,8 @@
I('acetone-d6') | I('d6-acetone') | I('chloroform-d') | I('d-chloroform') | I('methanol-d4') | I('d4-methanol') |
I('pyridine-d5') | I('d5-pyridine') | I('DMSO-d6') | I('d6-DMSO') | I('dimethylsulfoxide-d6') | W('C7D8') |
I('d6-dimethylsulfoxide') | W('MeOH-d4') | W('d4-MeOH') | I('DMSO') | I('benzene-d6') | I('d6-benzene') |
I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane')
I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') | I('MeOD-d4') |
I('d4-MeOD')

)

Expand Down
30 changes: 30 additions & 0 deletions chemdataextractor/parse/doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from .base import BaseParser
from .elements import W, R, Optional
from ..model import Compound
from .actions import merge


# Grammar for a DOI that the tokenizer has spread over several tokens:
#   1. a token containing "doi" (any letter case) with an optional ":" token
#      after it -- both hidden from the result;
#   2. a token containing the prefix "10." followed by four or more digits and
#      optional further ".digits" groups;
#   3. a "/" token;
#   4. a token containing non-whitespace characters other than " & ' < >.
# The matched tokens are passed through the ``merge`` action and the result is
# tagged 'doi'.
# NOTE(review): none of the R(...) patterns is anchored with ^...$, so they can
# match part of a token -- confirm that this is intended.
doi = ((R('[Dd][Oo][Ii]') + Optional(W(':'))).hide() +
R('10[.][0-9]{4,}(?:[.][0-9]+)*') +
W('/') +
R('(?:(?!["&\'<>])\S)+')).add_action(merge)('doi')


class DoiParser(BaseParser):
    """Parser rooted at the ``doi`` grammar; reports each match as a Compound."""

    root = doi

    def __init__(self):
        """Create the parser; no instance attributes are set here."""

    def interpret(self, result, start, end):
        """Yield a single Compound whose ``doi`` is the text of *result*.

        ``start`` and ``end`` are accepted but not used by this parser.
        """
        doi_values = result.xpath('./text()')
        record = Compound(doi=doi_values)
        yield record
9 changes: 8 additions & 1 deletion chemdataextractor/parse/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def _parse_tokens(self, tokens, i, actions=True):
class Regex(BaseParserElement):
"""Match token text with regular expression."""

def __init__(self, pattern, flags=0, group=None):
def __init__(self, pattern, flags=0, group=None, min_size=None, max_size=None):
super(Regex, self).__init__()
if isinstance(pattern, six.string_types):
self.regex = re.compile(pattern, flags)
Expand All @@ -266,9 +266,16 @@ def __init__(self, pattern, flags=0, group=None):
self.regex = pattern
self.pattern = pattern.pattern
self.group = group
# Token-length bounds checked in _parse_tokens before the regex is tried.
self.min_size = 0 if min_size is None else min_size
# Fall back to "no upper bound" only when max_size is omitted; otherwise keep
# the caller's max_size (min_size was mistakenly stored here before, so a
# supplied max_size was ignored).
self.max_size = float('inf') if max_size is None else max_size

def _parse_tokens(self, tokens, i, actions=True):
token_text = tokens[i][0]
token_size = len(token_text)

if not (self.min_size <= token_size < self.max_size):
raise ParseException(tokens, i, 'Expected %s, got %s' % (self.pattern, token_text), self)

result = self.regex.search(token_text)
if result:
text = tokens[i][0] if self.group is None else result.group(self.group)
Expand Down
47 changes: 47 additions & 0 deletions chemdataextractor/parse/hrms.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import re

from .base import BaseParser
from .elements import OneOrMore, R, Optional, ZeroOrMore, Not
from ..model import Compound, HRMS
from ..utils import first
from .actions import merge

# Regex fragments kept as plain strings so they can be concatenated into
# larger patterns. Raw strings avoid invalid-escape warnings for ``\.``,
# ``\+`` and ``\-``.
not_separator = r'[^\.;,]$'  # text whose last character is not . ; or ,
separator = r'[\.;,]'
# A plus sign or any of the hyphen / dash / minus variants.
# The bare ``-`` that sat between ``−`` (U+2212) and ``⁻`` (U+207B) was parsed
# as a range operator whose start is above its end, which makes re.compile
# raise "bad character range". It is dropped because the ASCII hyphen is
# already covered by ``\-``.
chem_sign = r'[\+\-‐‑⁃‒–—―−⁻]'
# A token that is an integer or a decimal number.
number = R('^\d+(\.\d+)?$')
# A formula-like token: one or more element-style symbols (capital letter,
# optional lowercase letter, optional digits) or parenthesised groups followed
# by digits, with an optional trailing sign character. min_size=5 makes the
# element reject tokens shorter than five characters.
chemical_name = R('^(([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+' + chem_sign + '?)$', min_size=5)
# The chemical_name pattern above was obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas
# Lead-in words before a formula: a token containing "calcd"/"calculated", or
# a token starting with "for" (both case-insensitive).
chemical_structure_start = (R('(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE))
# Zero or more hidden (lead-in token, token not ending in a separator) pairs,
# then the formula itself tagged 'chemical_structure', then an optional hidden
# separator token.
chemical_structure = (ZeroOrMore(chemical_structure_start + R(not_separator)).hide() + (chemical_name('chemical_structure')) + Optional(R(separator)).hide())
# compound = (R('^\[') + ZeroOrMore(R('\.+')) + R('\]')).add_action(merge)('compound')

# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical')
# experimental = (Optional(W('found')).hide() + number('mass'))('experimental')
# Tokens consumed but hidden from the result: a number, a token ending in a
# sign character, or "found"/digits with an optional trailing separator; each
# may be followed by a separator token.
exceptions = ((number | R(chem_sign + '$') | R(u'((^found|^\d+)' + separator + '?)$', flags=re.IGNORECASE)) + Optional(R(separator))).hide()

# Root grammar: a hidden token containing H, R, M and S in that order, then
# any run of formulas, hidden exceptions, or hidden tokens that do not end in
# a separator. The whole match is tagged 'hrms'.
hrms = (R('^.*H.*R.*M.*S.*$').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms')


class HRMSParser(BaseParser):
    """Parser rooted at the ``hrms`` grammar; reports matches as Compound records."""

    root = hrms

    def __init__(self):
        """Create the parser; no instance attributes are set here."""

    def interpret(self, result, start, end):
        """Yield one Compound carrying a single HRMS record.

        Only the first ``chemical_structure`` text found in *result* is kept.
        ``start`` and ``end`` are accepted but not used by this parser.
        """
        structures = result.xpath('./chemical_structure/text()')
        measurement = HRMS(chemical_structure=first(structures))
        compound = Compound()
        compound.hrms.append(measurement)
        yield compound
14 changes: 9 additions & 5 deletions chemdataextractor/parse/nmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,12 @@

from ..model import Compound, NmrSpectrum, NmrPeak
from ..utils import first
from .actions import join, merge, strip_stop, fix_whitespace
from .actions import join, merge, strip_stop, fix_whitespace, join_comma
from .base import BaseParser
from .common import cc, equals
from .cem import chemical_name, nmr_solvent
from .elements import W, I, T, R, Optional, ZeroOrMore, SkipTo, OneOrMore, Not, Group
from ..common import REG_EXP

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -80,11 +81,14 @@ def strip_delta(tokens, start, result):
shift_error = (Optional(R('^[\-–−‒]$')) + R('^δ?[\+\-–−‒]?\d+(\.+\d+)?,\d+(\.+\d+)?\.?$'))('shift').add_action(merge)
shift = (shift_range | shift_value | shift_error).add_action(strip_stop).add_action(strip_delta)

split = R('^(br?)?(s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$')
split = R(REG_EXP.MULTIPLICITY)
multiplicity = (OneOrMore(split) + Optional(W('of') + split))('multiplicity').add_action(join)

coupling_value = (number + ZeroOrMore(R('^[,;&]$') + number + Not(W('H'))))('value').add_action(join)
coupling = ((R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling')
# Whole-token separator between coupling constants: ",", ";", "&" or "and".
# The alternation is grouped so that both anchors apply to every alternative;
# ungrouped, '^[,;&]|and$' also matched any token that merely starts with a
# separator character or ends in "and".
coupling_separator = r'^(?:[,;&]|and)$'
coupling_signature = R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')
coupling_value = (number + ZeroOrMore((Optional(W('Hz')) + R(coupling_separator) + Optional(coupling_signature)).hide() + number + Not(W('H'))))('value').add_action(join_comma)
coupling = (coupling_signature.hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R(
coupling_separator).hide() + coupling_value + W('Hz')('units')))('coupling')

number = (R('^\d+(\.\d+)?[HCNPF]\.?$') | (R('^\d+(\.\d+)?$') + R('^[HCNPF]\.?$')))('number').add_action(merge)

Expand Down Expand Up @@ -143,4 +147,4 @@ def interpret(self, result, start, end):
n.peaks.append(nmr_peak)

c.nmr_spectra.append(n)
yield c
yield c
10 changes: 10 additions & 0 deletions tests/test_parse_cem.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ def test_to_yield_phrase(self):
]
self.do_parse(s, expected)

def test_label_start(self):
s = '1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene (Leu-07)'
expected = ['<cem_phrase><cem><name>1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene</name><label>Leu-07</label></cem></cem_phrase>']
self.do_parse(s, expected)


class TestParseCemHeading(unittest.TestCase):

Expand Down Expand Up @@ -403,6 +408,11 @@ def test_consecutive_headings2(self):
Paragraph('The product had a melting point of 70-75° C. and has structural formula VII.')
)
results = [r.serialize() for r in d.records]
print(results)
print([
{'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']},
{'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']}
])
self.assertEqual(results, [
{'labels': [u'VII'], 'roles': [u'formula']},
{'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}],
Expand Down
Loading