mcs07 · 0x33467 · Aug 11, 2017 · Aug 11, 2017 · Aug 11, 2017 · Aug 16, 2017
diff --git a/chemdataextractor/common/REG_EXP.py b/chemdataextractor/common/REG_EXP.py
@@ -0,0 +1,6 @@
+
+# Common regular expressions shared across modules.
+# TODO: support "br. s" as a multiplicity.
+# NOTE(review): the alternative "br." has an unescaped dot, so it matches "br" followed by any single
+# character (e.g. "brs"); confirm this is intended before escaping it.
+MULTIPLICITY = r'^(br\.)?(br.|s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$'
diff --git a/chemdataextractor/common/__init__.py b/chemdataextractor/common/__init__.py
@@ -0,0 +1 @@
+from .REG_EXP import *
diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py
@@ -185,7 +185,7 @@ def records(self):
                     sent_record = first_sent_records[0]
                     if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2):
                         head_def_record = sent_record
-                        head_def_record_i = i
+                        head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well
 
             for record in el.records:
                 # Keep track of the most recent record with labels
@@ -215,10 +215,11 @@ def records(self):
                         continue
                     else:
                         # print(record.serialize())
+                        # TODO: check the names and labels, not the whole record
                         # We have property values but no names or labels... try merge those from previous
                         if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record):
                             # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name)
-                            if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
+                            if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)):
                                 if head_def_record:
                                     record.names = head_def_record.names
                                     record.labels = head_def_record.labels
@@ -272,6 +273,10 @@ def records(self):
                         record.names.append(name)
 
         # Merge records with any shared name/label
+        # TODO: combining several labels into one record because of an 'and' is not a good idea; this
+        # should be handled elsewhere in the code.
+        # For now, keep only the records that carry at most one label before merging.
+        records.models = [record for record in records if len(record.labels) <= 1]
         len_l = len(records)
         i = 0
         while i < (len_l - 1):

diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py
@@ -26,7 +26,9 @@
 from ..parse.mp import MpParser
 from ..parse.tg import TgParser
 from ..parse.nmr import NmrParser
+from ..parse.doi import DoiParser
 from ..parse.uvvis import UvvisParser
+from ..parse.hrms import HRMSParser
 from ..nlp.lexicon import ChemLexicon
 from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS
 from ..nlp.abbrev import ChemAbbreviationDetector
@@ -266,7 +268,8 @@ def _repr_html_(self):
 
 class Paragraph(Text):
 
-    parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()]
+    parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(),
+               ContextParser(), DoiParser(), HRMSParser()]
 
     def _repr_html_(self):
         return '<p class="cde-paragraph">' + self.text + '</p>'
@@ -510,6 +513,7 @@ def records(self):
         tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
         for parser in self.parsers:
             for record in parser.parse(tagged_tokens):
+                # print(record)
                 p = record.serialize()
                 if not p:  # TODO: Potential performance issues?
                     continue

diff --git a/chemdataextractor/model.py b/chemdataextractor/model.py
@@ -22,12 +22,10 @@
 
 from .utils import python_2_unicode_compatible
 
-
 log = logging.getLogger(__name__)
 
 
 class BaseType(six.with_metaclass(ABCMeta)):
-
     # This is assigned by ModelMeta to match the attribute on the Model
     name = None
 
@@ -90,7 +88,6 @@ def process(self, value):
 
 
 class ModelType(BaseType):
-
     def __init__(self, model, **kwargs):
         self.model_class = model
         self.model_name = self.model_class.__name__
@@ -102,7 +99,6 @@ def serialize(self, value, primitive=False):
 
 
 class ListType(BaseType):
-
     def __init__(self, field, default=None, **kwargs):
         super(ListType, self).__init__(**kwargs)
         self.field = field
@@ -376,6 +372,11 @@ class NmrSpectrum(BaseModel):
     peaks = ListType(ModelType(NmrPeak))
 
 
+class HRMS(BaseModel):
+    """High Resolution Mass Spectrometry"""
+    chemical_structure = StringType()
+
+
 class MeltingPoint(BaseModel):
     """A melting point measurement."""
     value = StringType()
@@ -394,6 +395,7 @@ class GlassTransition(BaseModel):
     concentration = StringType(contextual=True)
     concentration_units = StringType(contextual=True)
 
+
 class QuantumYield(BaseModel):
     """A quantum yield measurement."""
     value = StringType()
@@ -439,6 +441,8 @@ class Compound(BaseModel):
     names = ListType(StringType())
     labels = ListType(StringType())
     roles = ListType(StringType())
+    doi = ListType(StringType())
+    hrms = ListType(ModelType(HRMS))
     nmr_spectra = ListType(ModelType(NmrSpectrum))
     ir_spectra = ListType(ModelType(IrSpectrum))
     uvvis_spectra = ListType(ModelType(UvvisSpectrum))
@@ -502,8 +506,8 @@ def is_unidentified(self):
     def is_id_only(self):
         """Return True if identifier information only."""
         for key, value in self.items():
-            if key not in {'names', 'labels', 'roles'} and value:
+            if key not in {'names', 'labels', 'roles', 'doi'} and value:
                 return False
-        if self.names or self.labels:
+        if self.names or self.labels or self.doi:
             return True
         return False
diff --git a/chemdataextractor/nlp/tokenize.py b/chemdataextractor/nlp/tokenize.py
@@ -14,12 +14,11 @@
 from abc import ABCMeta, abstractmethod
 import logging
 import re
-
 import six
 
 from ..text import bracket_level, GREEK
 from ..data import load_model
-
+from ..common import REG_EXP
 
 log = logging.getLogger(__name__)
 
@@ -447,6 +446,8 @@ class ChemWordTokenizer(WordTokenizer):
     NO_SPLIT_PREFIX_ENDING = re.compile('(^\(.*\)|^[\d,\'"“”„‟‘’‚‛`´′″‴‵‶‷⁗Α-Ωα-ω]+|ano|ato|azo|boc|bromo|cbz|chloro|eno|fluoro|fmoc|ido|ino|io|iodo|mercapto|nitro|ono|oso|oxalo|oxo|oxy|phospho|telluro|tms|yl|ylen|ylene|yliden|ylidene|ylidyn|ylidyne)$', re.U)
     #: Don't split on hyphen if prefix or suffix match this regular expression
     NO_SPLIT_CHEM = re.compile('([\-α-ω]|\d+,\d+|\d+[A-Z]|^d\d\d?$|acetic|acetyl|acid|acyl|anol|azo|benz|bromo|carb|cbz|chlor|cyclo|ethan|ethyl|fluoro|fmoc|gluc|hydro|idyl|indol|iene|ione|iodo|mercapto|n,n|nitro|noic|o,o|oxalo|oxo|oxy|oyl|onyl|phen|phth|phospho|pyrid|telluro|tetra|tms|ylen|yli|zole|alpha|beta|gamma|delta|epsilon|theta|kappa|lambda|sigma|omega)', re.U | re.I)
+    #: Split on comma if prefix or suffix match this regular expression (and they are not both chemical name parts)
+    INSIDE_PEAK = re.compile(REG_EXP.MULTIPLICITY + r'|^(M?Hz|\d+\.\d+)$')
     #: Don't split on hyphen if the prefix is one of these sequences
     NO_SPLIT_PREFIX = {
         'e', 'a', 'u', 'x', 'agro', 'ante', 'anti', 'arch', 'be', 'bi', 'bio', 'co', 'counter', 'cross', 'cyber',
@@ -657,6 +658,9 @@ def _subspan(self, s, span, nextspan):
                 # Split around colon unless it looks like we're in a chemical name
                 if not (before and after and after[0].isdigit() and before.rstrip('′\'')[-1:].isdigit() and '-' in after) and not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)):
                     return self._split_span(span, i, 1)
+            elif char == ',':
+                if not (self.NO_SPLIT_CHEM.search(before) and self.NO_SPLIT_CHEM.search(after)) and (self.INSIDE_PEAK.search(before) or self.INSIDE_PEAK.search(after)):
+                    return self._split_span(span, i, 1)
             elif char in {'x', '+', '−'}:
                 # Split around x, +, − (\u2212 minus) between two numbers or at start followed by numbers
                 if (i == 0 or self._is_number(before)) and self._is_number(after):

diff --git a/chemdataextractor/parse/actions.py b/chemdataextractor/parse/actions.py
@@ -19,7 +19,6 @@
 
 from ..text import HYPHENS
 
-
 log = logging.getLogger(__name__)
 
 
@@ -30,15 +29,20 @@ def flatten(tokens, start, result):
     return result
 
 
-def join(tokens, start, result):
-    """Join tokens into a single string with spaces between."""
+def join(tokens, start, result, separator=' '):
+    """Join tokens into a single string, with ``separator`` (a space by default) between them."""
     texts = []
     if len(result) > 0:
         for e in result:
             for child in e.iter():
                 if child.text is not None:
                     texts.append(child.text)
-        return [E(result[0].tag, ' '.join(texts))]
+        return [E(result[0].tag, separator.join(texts))]
+
+
+def join_comma(tokens, start, result):
+    """Join tokens into a single string with commas (and no spaces) between them."""
+    return join(tokens, start, result, separator=',')
 
 
 def merge(tokens, start, result):

diff --git a/chemdataextractor/parse/cem.py b/chemdataextractor/parse/cem.py
@@ -53,7 +53,7 @@
 
 label_blacklist = R('^(31P|[12]H|[23]D|15N|14C|[4567890]\d+)$')
 
-prefixed_label = R('^(cis|trans)-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$')
+prefixed_label = R('^(cis|trans|[A-Za-z]{,3})-((d-)?(\d{1,2}[A-Za-z]{0,2}[′″‴‶‷⁗]?)(-d)?|[LS]\d\d?)$')
 
 #: Chemical label. Very permissive - must be used in context to avoid false positives.
 strict_chemical_label = Not(label_blacklist) + (alphanumeric | roman_numeral | letter_number | prefixed_label)('label')
@@ -124,7 +124,8 @@
     I('acetone-d6') | I('d6-acetone') | I('chloroform-d') | I('d-chloroform') | I('methanol-d4') | I('d4-methanol') |
     I('pyridine-d5') | I('d5-pyridine') | I('DMSO-d6') | I('d6-DMSO') | I('dimethylsulfoxide-d6') | W('C7D8') |
     I('d6-dimethylsulfoxide') | W('MeOH-d4') | W('d4-MeOH') | I('DMSO') | I('benzene-d6') | I('d6-benzene') |
-    I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane')
+    I('1,1,2,2-tetrachloroethane-d2') | I('tetrachloroethane-d2') | I('d2-tetrachloroethane') | I('MeOD-d4') |
+    I('d4-MeOD')
 
 )
 

diff --git a/chemdataextractor/parse/doi.py b/chemdataextractor/parse/doi.py
@@ -0,0 +1,33 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .base import BaseParser
+from .elements import W, R, Optional
+from ..model import Compound
+from .actions import merge
+
+
+# DOI grammar: a hidden "doi" marker (optionally followed by ":"), then a token containing a
+# "10." prefix with at least four digits, a "/" token and a suffix token.
+doi = ((R('[Dd][Oo][Ii]') + Optional(W(':'))).hide() +
+       R('10[.][0-9]{4,}(?:[.][0-9]+)*') +
+       W('/') +
+       R(r'(?:(?!["&\'<>])\S)+')).add_action(merge)('doi')
+
+
+class DoiParser(BaseParser):
+    """Parser that extracts DOI strings matched by the ``doi`` grammar."""
+    root = doi
+
+    def __init__(self):
+        pass
+
+    def interpret(self, result, start, end):
+        """Yield a Compound whose ``doi`` field holds the text nodes of the matched result."""
+        c = Compound(
+            doi=result.xpath('./text()')
+        )
+
+        yield c
diff --git a/chemdataextractor/parse/elements.py b/chemdataextractor/parse/elements.py
@@ -257,7 +257,7 @@ def _parse_tokens(self, tokens, i, actions=True):
 class Regex(BaseParserElement):
     """Match token text with regular expression."""
 
-    def __init__(self, pattern, flags=0, group=None):
+    def __init__(self, pattern, flags=0, group=None, min_size=None, max_size=None):
         super(Regex, self).__init__()
         if isinstance(pattern, six.string_types):
             self.regex = re.compile(pattern, flags)
@@ -266,9 +266,16 @@ def __init__(self, pattern, flags=0, group=None):
             self.regex = pattern
             self.pattern = pattern.pattern
         self.group = group
+        self.min_size = 0 if min_size is None else min_size  # inclusive lower bound on token length
+        self.max_size = float('inf') if max_size is None else max_size  # exclusive upper bound on token length
 
     def _parse_tokens(self, tokens, i, actions=True):
         token_text = tokens[i][0]
+        token_size = len(token_text)
+
+        if not (self.min_size <= token_size < self.max_size):
+            raise ParseException(tokens, i, 'Expected %s, got %s' % (self.pattern, token_text), self)
+
         result = self.regex.search(token_text)
         if result:
             text = tokens[i][0] if self.group is None else result.group(self.group)

diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import re
+
+from .base import BaseParser
+from .elements import OneOrMore, R, Optional, ZeroOrMore, Not
+from ..model import Compound, HRMS
+from ..utils import first
+from .actions import merge
+
+# Raw strings keep the regex escapes below from being interpreted as string escapes.
+not_separator = r'[^\.;,]$'
+separator = r'[\.;,]'
+chem_sign = r'[\+\-‐‑⁃‒–—―−－⁻]'
+number = R(r'^\d+(\.\d+)?$')
+# Formula pattern obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas
+chemical_name = R(r'^(([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+' + chem_sign + '?)$', min_size=5)
+chemical_structure_start = (R('(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE))
+chemical_structure = (ZeroOrMore(chemical_structure_start + R(not_separator)).hide() + (chemical_name('chemical_structure')) + Optional(R(separator)).hide())
+# compound = (R('^\[') + ZeroOrMore(R('\.+')) + R('\]')).add_action(merge)('compound')
+
+# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical')
+# experimental = (Optional(W('found')).hide() + number('mass'))('experimental')
+exceptions = ((number | R(chem_sign + '$') | R(r'((^found|^\d+)' + separator + '?)$', flags=re.IGNORECASE)) + Optional(R(separator))).hide()
+
+hrms = (R('^.*H.*R.*M.*S.*$').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms')
+
+
+class HRMSParser(BaseParser):
+    """Parser that extracts the chemical formula reported in an HRMS statement."""
+    root = hrms
+
+    def __init__(self):
+        pass
+
+    def interpret(self, result, start, end):
+        """Yield a Compound holding one HRMS record built from the first matched chemical_structure text."""
+        h = HRMS(
+            chemical_structure=first(result.xpath('./chemical_structure/text()'))
+        )
+        c = Compound()
+        c.hrms.append(h)
+
+        yield c
diff --git a/chemdataextractor/parse/nmr.py b/chemdataextractor/parse/nmr.py
@@ -19,11 +19,12 @@
 
 from ..model import Compound, NmrSpectrum, NmrPeak
 from ..utils import first
-from .actions import join, merge, strip_stop, fix_whitespace
+from .actions import join, merge, strip_stop, fix_whitespace, join_comma
 from .base import BaseParser
 from .common import cc, equals
 from .cem import chemical_name, nmr_solvent
 from .elements import W, I, T, R, Optional, ZeroOrMore, SkipTo, OneOrMore, Not, Group
+from ..common import REG_EXP
 
 log = logging.getLogger(__name__)
 
@@ -80,11 +81,14 @@ def strip_delta(tokens, start, result):
 shift_error = (Optional(R('^[\-–−‒]$')) + R('^δ?[\+\-–−‒]?\d+(\.+\d+)?,\d+(\.+\d+)?\.?$'))('shift').add_action(merge)
 shift = (shift_range | shift_value | shift_error).add_action(strip_stop).add_action(strip_delta)
 
-split = R('^(br?)?(s|S|d|D|t|T|q|Q|quint|sept|m|M|dd|ddd|dt|td|tt|br|bs|sb|h|ABq|broad|singlet|doublet|triplet|qua(rtet)?|quintet|septet|multiplet|multiple|peaks)$')
+split = R(REG_EXP.MULTIPLICITY)
 multiplicity = (OneOrMore(split) + Optional(W('of') + split))('multiplicity').add_action(join)
 
-coupling_value = (number + ZeroOrMore(R('^[,;&]$') + number + Not(W('H'))))('value').add_action(join)
-coupling = ((R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')).hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R('^[,;&]$').hide() + coupling_value + W('Hz')('units')))('coupling')
+coupling_separator = r'^([,;&]|and)$'  # group the alternatives so both anchors apply: the whole token must be a separator
+coupling_signature = R('^\d?J([HCNPFD\d,]*|cis|trans)$') + Optional(R('^[\-–−‒]$') + R('^[HCNPF\d]$')) + Optional('=')
+coupling_value = (number + ZeroOrMore((Optional(W('Hz')) + R(coupling_separator) + Optional(coupling_signature)).hide() + number + Not(W('H'))))('value').add_action(join_comma)
+coupling = (coupling_signature.hide() + coupling_value + Optional(W('Hz')('units')) + ZeroOrMore(R(
+    coupling_separator).hide() + coupling_value + W('Hz')('units')))('coupling')
 
 number = (R('^\d+(\.\d+)?[HCNPF]\.?$') | (R('^\d+(\.\d+)?$') + R('^[HCNPF]\.?$')))('number').add_action(merge)
 
@@ -143,4 +147,4 @@ def interpret(self, result, start, end):
             n.peaks.append(nmr_peak)
 
         c.nmr_spectra.append(n)
-        yield c
+        yield c
diff --git a/tests/test_parse_cem.py b/tests/test_parse_cem.py
@@ -170,6 +170,11 @@ def test_to_yield_phrase(self):
         ]
         self.do_parse(s, expected)
 
+    def test_label_start(self):
+        s = '1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene (Leu-07)'
+        expected = ['<cem_phrase><cem><name>1,3,5-Tricyano-2,4,6-tris(2-dimethylaminovinyl)benzene</name><label>Leu-07</label></cem></cem_phrase>']
+        self.do_parse(s, expected)
+
 
 class TestParseCemHeading(unittest.TestCase):
 
@@ -403,6 +408,11 @@ def test_consecutive_headings2(self):
             Paragraph('The product had a melting point of 70-75° C. and has structural formula VII.')
         )
         results = [r.serialize() for r in d.records]
+        print(results)
+        print([
+            {'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']},
+            {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']}
+        ])
         self.assertEqual(results, [
             {'labels': [u'VII'], 'roles': [u'formula']},
             {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}],