From 74ab31d410c8811252731f108614802fe435b6a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Hern=C3=A1ndez?= Date: Thu, 17 Aug 2017 10:58:53 +0200 Subject: [PATCH] doi extractor working --- chemdataextractor/doc/text.py | 5 +++- chemdataextractor/model.py | 10 +++---- chemdataextractor/parse/doi.py | 30 +++++++++++++++++++++ tests/test_parse_doi.py | 49 ++++++++++++++++++++++++++++++++++ tests/test_parse_nmr.py | 2 +- 5 files changed, 88 insertions(+), 8 deletions(-) create mode 100644 chemdataextractor/parse/doi.py create mode 100644 tests/test_parse_doi.py diff --git a/chemdataextractor/doc/text.py b/chemdataextractor/doc/text.py index f99c762..928559e 100644 --- a/chemdataextractor/doc/text.py +++ b/chemdataextractor/doc/text.py @@ -26,6 +26,7 @@ from ..parse.mp import MpParser from ..parse.tg import TgParser from ..parse.nmr import NmrParser +from ..parse.doi import DoiParser from ..parse.uvvis import UvvisParser from ..nlp.lexicon import ChemLexicon from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS @@ -266,7 +267,8 @@ def _repr_html_(self): class Paragraph(Text): - parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()] + parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), + ContextParser(), DoiParser()] def _repr_html_(self): return '

' + self.text + '

' @@ -510,6 +512,7 @@ def records(self): tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens] for parser in self.parsers: for record in parser.parse(tagged_tokens): + # print(record) p = record.serialize() if not p: # TODO: Potential performance issues? continue diff --git a/chemdataextractor/model.py b/chemdataextractor/model.py index f54666a..bf1a6c6 100644 --- a/chemdataextractor/model.py +++ b/chemdataextractor/model.py @@ -22,12 +22,10 @@ from .utils import python_2_unicode_compatible - log = logging.getLogger(__name__) class BaseType(six.with_metaclass(ABCMeta)): - # This is assigned by ModelMeta to match the attribute on the Model name = None @@ -90,7 +88,6 @@ def process(self, value): class ModelType(BaseType): - def __init__(self, model, **kwargs): self.model_class = model self.model_name = self.model_class.__name__ @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False): class ListType(BaseType): - def __init__(self, field, default=None, **kwargs): super(ListType, self).__init__(**kwargs) self.field = field @@ -394,6 +390,7 @@ class GlassTransition(BaseModel): concentration = StringType(contextual=True) concentration_units = StringType(contextual=True) + class QuantumYield(BaseModel): """A quantum yield measurement.""" value = StringType() @@ -439,6 +436,7 @@ class Compound(BaseModel): names = ListType(StringType()) labels = ListType(StringType()) roles = ListType(StringType()) + doi = ListType(StringType()) nmr_spectra = ListType(ModelType(NmrSpectrum)) ir_spectra = ListType(ModelType(IrSpectrum)) uvvis_spectra = ListType(ModelType(UvvisSpectrum)) @@ -502,8 +500,8 @@ def is_unidentified(self): def is_id_only(self): """Return True if identifier information only.""" for key, value in self.items(): - if key not in {'names', 'labels', 'roles'} and value: + if key not in {'names', 'labels', 'roles', 'doi'} and value: return False - if self.names or self.labels: + if self.names or self.labels or self.doi: return True return False diff --git a/chemdataextractor/parse/doi.py b/chemdataextractor/parse/doi.py new file mode 100644 index 0000000..f7a54cd --- /dev/null +++ b/chemdataextractor/parse/doi.py @@ -0,0 +1,30 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +from .base import BaseParser +from .elements import W, R, Optional +from ..model import StringType, Compound +from .actions import merge + + +doi = ((R('[Dd][Oo][Ii]') + Optional(W(':'))).hide() + + R('10[.][0-9]{4,}(?:[.][0-9]+)*') + + W('/') + + R('(?:(?!["&\'<>])\S)+')).add_action(merge)('doi') + + +class DoiParser(BaseParser): + """""" + root = doi + + def __init__(self): + pass + + def interpret(self, result, start, end): + c = Compound( + doi=result.xpath('./text()') + ) + + yield c diff --git a/tests/test_parse_doi.py b/tests/test_parse_doi.py new file mode 100644 index 0000000..5faec64 --- /dev/null +++ b/tests/test_parse_doi.py @@ -0,0 +1,49 @@ +# -*- coding: utf-8 -*- +""" +test_parse_doi +~~~~~~~~~~~~~~ + +Test DOI parser. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import logging +import unittest + +from lxml import etree + +from chemdataextractor.doc.text import Sentence +from chemdataextractor.parse.doi import doi + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +class TestParseDOI(unittest.TestCase): + maxDiff = None + + def do_parse(self, input, expected): + s = Sentence(input) + log.debug(s) + log.debug(s.tagged_tokens) + result = next(doi.scan(s.tagged_tokens))[0] + log.debug(etree.tostring(result, pretty_print=True, encoding='unicode')) + self.assertEqual(expected, etree.tostring(result, encoding='unicode')) + + def test_doi1(self): + tests = [ + 'DOI:10.1021/jo101758t', + 'doi:10.3390/molecules201219848\n hello world', + 'Molecules 2015, 20(12), 22272-22285; doi:10.3390/molecules201219846' + ] + values = [ + '10.1021/jo101758t', + '10.3390/molecules201219848', + '10.3390/molecules201219846' + ] + for test, expected in zip(tests, values): + self.do_parse(test, expected) diff --git a/tests/test_parse_nmr.py b/tests/test_parse_nmr.py index cb1ee54..48fcbdf 100644 --- a/tests/test_parse_nmr.py +++ b/tests/test_parse_nmr.py @@ -33,7 +33,7 @@ def do_parse(self, input, expected): log.debug(s.tagged_tokens) result = next(nmr.scan(s.tagged_tokens))[0] log.debug(etree.tostring(result, pretty_print=True, encoding='unicode')) - self.assertEqual(etree.tostring(result, encoding='unicode'), expected) + self.assertEqual(expected, etree.tostring(result, encoding='unicode')) def test_nmr1(self): s = '1H NMR (300 MHz, CDCl3), 1.00 (t, J = 7.3 Hz, 3H), 1.50 (m, 2H), 1.77 (m, 2H), 2.42 (s, 3H), ' \