Skip to content

Commit

Permalink
doi extractor working
Browse files Browse the repository at this point in the history
  • Loading branch information
0x33467 committed Aug 17, 2017
1 parent 9353a47 commit 74ab31d
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 8 deletions.
5 changes: 4 additions & 1 deletion chemdataextractor/doc/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ..parse.mp import MpParser
from ..parse.tg import TgParser
from ..parse.nmr import NmrParser
from ..parse.doi import DoiParser
from ..parse.uvvis import UvvisParser
from ..nlp.lexicon import ChemLexicon
from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS
Expand Down Expand Up @@ -266,7 +267,8 @@ def _repr_html_(self):

class Paragraph(Text):

parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()]
parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(),
ContextParser(), DoiParser()]

def _repr_html_(self):
return '<p class="cde-paragraph">' + self.text + '</p>'
Expand Down Expand Up @@ -510,6 +512,7 @@ def records(self):
tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
for parser in self.parsers:
for record in parser.parse(tagged_tokens):
# print(record)
p = record.serialize()
if not p: # TODO: Potential performance issues?
continue
Expand Down
10 changes: 4 additions & 6 deletions chemdataextractor/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@

from .utils import python_2_unicode_compatible


log = logging.getLogger(__name__)


class BaseType(six.with_metaclass(ABCMeta)):

# This is assigned by ModelMeta to match the attribute on the Model
name = None

Expand Down Expand Up @@ -90,7 +88,6 @@ def process(self, value):


class ModelType(BaseType):

def __init__(self, model, **kwargs):
self.model_class = model
self.model_name = self.model_class.__name__
Expand All @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False):


class ListType(BaseType):

def __init__(self, field, default=None, **kwargs):
super(ListType, self).__init__(**kwargs)
self.field = field
Expand Down Expand Up @@ -394,6 +390,7 @@ class GlassTransition(BaseModel):
concentration = StringType(contextual=True)
concentration_units = StringType(contextual=True)


class QuantumYield(BaseModel):
"""A quantum yield measurement."""
value = StringType()
Expand Down Expand Up @@ -439,6 +436,7 @@ class Compound(BaseModel):
names = ListType(StringType())
labels = ListType(StringType())
roles = ListType(StringType())
doi = ListType(StringType())
nmr_spectra = ListType(ModelType(NmrSpectrum))
ir_spectra = ListType(ModelType(IrSpectrum))
uvvis_spectra = ListType(ModelType(UvvisSpectrum))
Expand Down Expand Up @@ -502,8 +500,8 @@ def is_unidentified(self):
def is_id_only(self):
"""Return True if identifier information only."""
for key, value in self.items():
if key not in {'names', 'labels', 'roles'} and value:
if key not in {'names', 'labels', 'roles', 'doi'} and value:
return False
if self.names or self.labels:
if self.names or self.labels or self.doi:
return True
return False
30 changes: 30 additions & 0 deletions chemdataextractor/parse/doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from .base import BaseParser
from .elements import W, R, Optional
from ..model import StringType, Compound
from .actions import merge


# Grammar for a DOI mention.
#
# Sequence of tokens, in order:
#   1. a token matching "doi" in any letter case, optionally followed by a ":"
#      token -- both are hidden, so they do not appear in the result;
#   2. a token matching the DOI prefix: "10." followed by four or more digits,
#      optionally with further ".<digits>" groups;
#   3. a "/" token;
#   4. a token matching the DOI suffix: non-whitespace characters, none of
#      which is one of  " & ' < >.
# The matched tokens are combined by the ``merge`` action and the result is
# named 'doi'.
#
# The patterns are raw strings so that regex escapes such as ``\S`` reach the
# regex engine untouched instead of relying on Python passing through an
# invalid string escape (deprecated since Python 3.6).
doi = (
    (R(r'[Dd][Oo][Ii]') + Optional(W(':'))).hide() +
    R(r'10[.][0-9]{4,}(?:[.][0-9]+)*') +
    W('/') +
    R(r'''(?:(?!["&'<>])\S)+''')
).add_action(merge)('doi')


class DoiParser(BaseParser):
    """Parser that reports matches of the ``doi`` grammar as Compound records."""

    # Grammar element this parser is built around.
    root = doi

    def __init__(self):
        # No per-instance state is set up here; note that the base class
        # initialiser is not invoked.
        pass

    def interpret(self, result, start, end):
        """Yield a single Compound carrying the DOI text found in *result*.

        :param result: Parse result for one ``doi`` match; it must support
            ``xpath`` (presumably an lxml element -- confirm against
            BaseParser).
        :param start: Not used by this parser.
        :param end: Not used by this parser.
        """
        # Text nodes directly under the result element hold the DOI string(s).
        doi_values = result.xpath('./text()')
        compound = Compound(doi=doi_values)
        yield compound
49 changes: 49 additions & 0 deletions tests/test_parse_doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
"""
test_parse_doi
~~~~~~~~~~~~~~
Test DOI parser.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import unittest

from lxml import etree

from chemdataextractor.doc.text import Sentence
from chemdataextractor.parse.doi import doi

# Configure the root logger at DEBUG level so that the log.debug() calls made
# by the tests in this module are emitted, and create this module's logger.
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)


class TestParseDOI(unittest.TestCase):
    """Tests for the ``doi`` grammar applied to tokenised sentences."""

    # Show complete diffs when an assertion fails.
    maxDiff = None

    def do_parse(self, input, expected):
        """Scan *input* with the ``doi`` grammar and check the first match.

        :param input: Sentence text to tokenise and scan.
        :param expected: Expected XML serialisation of the first match.
        """
        sentence = Sentence(input)
        log.debug(sentence)
        log.debug(sentence.tagged_tokens)
        # Only the first match produced by the scan is checked; its leading
        # item is the element that gets serialised.
        first_match = next(doi.scan(sentence.tagged_tokens))
        element = first_match[0]
        log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
        actual = etree.tostring(element, encoding='unicode')
        self.assertEqual(expected, actual)

    def test_doi1(self):
        """DOIs are extracted regardless of marker case or surrounding text."""
        cases = (
            (
                'DOI:10.1021/jo101758t',
                '<doi>10.1021/jo101758t</doi>',
            ),
            (
                'doi:10.3390/molecules201219848\n hello world',
                '<doi>10.3390/molecules201219848</doi>',
            ),
            (
                'Molecules 2015, 20(12), 22272-22285; doi:10.3390/molecules201219846',
                '<doi>10.3390/molecules201219846</doi>',
            ),
        )
        for text, expected in cases:
            self.do_parse(text, expected)
2 changes: 1 addition & 1 deletion tests/test_parse_nmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def do_parse(self, input, expected):
log.debug(s.tagged_tokens)
result = next(nmr.scan(s.tagged_tokens))[0]
log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
self.assertEqual(etree.tostring(result, encoding='unicode'), expected)
self.assertEqual(expected, etree.tostring(result, encoding='unicode'))

def test_nmr1(self):
s = '1H NMR (300 MHz, CDCl3), 1.00 (t, J = 7.3 Hz, 3H), 1.50 (m, 2H), 1.77 (m, 2H), 2.42 (s, 3H), ' \
Expand Down

0 comments on commit 74ab31d

Please sign in to comment.