Initial commit

mcs07 · Apr 16, 2016 · 945b77b · 945b77b
commit 945b77b
Show file tree

Hide file tree

Showing 105 changed files with 18,028 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,41 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Unit test / coverage reports
+htmlcov/
+.coverage
+.coverage.*
+.cache
+coverage.xml
+
+# Sphinx documentation
+docs/_build/
+
+# IPython Notebook
+.ipynb_checkpoints/
+
+# PyCharm
+.idea/
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright 2016 Matt Swain
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1,7 @@
+include README.rst
+include requirements.txt
+include requirements-dev.txt
+include LICENSE
+recursive-include tests *.py
+recursive-include scripts *
+prune docs
diff --git a/README.rst b/README.rst
@@ -0,0 +1,6 @@
+ChemDataExtractor
+=================
+
+ChemDataExtractor is a toolkit for extracting chemical information from the scientific literature.
+
+http://chemdataextractor.org
diff --git a/chemdataextractor/__init__.py b/chemdataextractor/__init__.py
@@ -0,0 +1,28 @@
+# -*- coding: utf-8 -*-
+"""
+ChemDataExtractor
+~~~~~~~~~~~~~~~~~
+
+:copyright: Copyright 2016 by Matt Swain.
+:license: MIT, see LICENSE file for more details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+import logging
+
+
+# Package metadata, exposed as module-level dunder attributes.
+__title__ = 'ChemDataExtractor'
+__version__ = '1.0.0'
+__author__ = 'Matt Swain'
+# NOTE(review): the value below looks like an e-mail-obfuscation placeholder rather than
+# a real address -- confirm against the upstream repository before relying on it.
+__email__ = '[email protected]'
+__license__ = 'MIT'
+__copyright__ = 'Copyright 2016 Matt Swain'
+
+# Package-level logger. Only a NullHandler is attached here, so the library itself
+# configures no log output; handlers and levels are left to the application.
+log = logging.getLogger(__name__)
+log.addHandler(logging.NullHandler())
+
+
+# Re-export Document so it is importable directly from the package.
+# NOTE(review): presumably placed after the definitions above so that submodules can
+# import those package-level names while this import runs -- confirm.
+from .doc.document import Document
diff --git a/chemdataextractor/biblio/__init__.py b/chemdataextractor/biblio/__init__.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+"""
+chemdataextractor.biblio
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Tools for dealing with bibliographic information.
+
+:copyright: Copyright 2016 by Matt Swain.
+:license: MIT, see LICENSE file for more details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+from .bibtex import BibtexParser, parse_bibtex
+from .person import PersonName
+from .xmp import XmpParser, parse_xmp
diff --git a/chemdataextractor/biblio/bibtex.py b/chemdataextractor/biblio/bibtex.py
@@ -0,0 +1,213 @@
+# -*- coding: utf-8 -*-
+"""
+chemdataextractor.biblio.bibtex
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+BibTeX parser.
+
+:copyright: Copyright 2016 by Matt Swain.
+:license: MIT, see LICENSE file for more details.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+from collections import OrderedDict
+import json
+import re
+
+from ..text.latex import latex_to_unicode
+
+
+class BibtexParser(object):
+    """Parse a BibTeX string into Python data structures or a JSON string.
+
+    The input is split into tokens by a regular expression and consumed by a small
+    hand-written parser. ``@string`` definitions are stored in ``definitions`` and
+    substituted into later values, ``@comment`` and ``@preamble`` entries do not
+    produce records, and every other entry becomes a record in ``records`` keyed by
+    its citation key.
+
+    Example usage::
+
+        with open('example.bib', 'r') as f:
+            bib = BibtexParser(f.read())
+            bib.parse()
+            print(bib.records_list)
+            print(bib.json)
+    """
+
+    # Tokenizer: a run of ordinary characters, a single whitespace character, or a
+    # single BibTeX special character. Compiled once for all instances.
+    _TOKEN_RE = re.compile(r'([^\s"\'#%@{}()=,]+|\s|"|\'|#|%|@|{|}|\(|\)|=|,)')
+
+    def __init__(self, data, **kwargs):
+        """Initialize BibtexParser with data.
+
+        Optional metadata passed as keyword arguments will be included in the JSON output.
+        e.g. collection, label, description, id, owner, created, modified, source
+
+        Example usage::
+
+            bib = BibtexParser(data, created=datetime.utcnow().isoformat(), owner='mcs07')
+
+        :param data: The BibTeX source text to parse.
+        :param kwargs: Extra metadata merged into :attr:`metadata`.
+        """
+        self.data = data
+        self.meta = kwargs
+        self._token = None
+        self.token_type = None
+        # Lazy iterator of regex matches; each token is consumed exactly once.
+        self._tokens = self._TOKEN_RE.finditer(self.data)
+        self.mode = None
+        self.definitions = {}
+        self.records = OrderedDict()
+
+        # Key name normalizations
+        self.keynorms = {
+            u'keyw': u'keyword',
+            u'keywords': u'keyword',
+            u'authors': u'author',
+            u'editors': u'editor',
+            u'url': u'link',
+            u'urls': u'link',
+            u'links': u'link',
+            u'subjects': u'subject'
+        }
+
+    def _next_token(self, skipws=True):
+        """Advance ``_token`` to the next token and return it.
+
+        :param skipws: If true, whitespace tokens are skipped.
+        :raises StopIteration: When the input is exhausted.
+        """
+        # The next() builtin works on both Python 2 and 3, unlike the .next() method.
+        self._token = next(self._tokens).group(0)
+        # Skip whitespace with a loop rather than recursion, so a long run of
+        # whitespace cannot exceed the interpreter's recursion limit.
+        while skipws and self._token.isspace():
+            self._token = next(self._tokens).group(0)
+        return self._token
+
+    def parse(self):
+        """Parse self.data and store the parsed BibTeX to self.records."""
+        while True:
+            try:
+                # TODO: If self._next_token() == '%' skip to newline?
+                if self._next_token() == '@':
+                    self._parse_entry()
+            except StopIteration:
+                # Raised by _next_token at end of input, including mid-entry.
+                break
+
+    def _parse_entry(self):
+        """Parse the entry whose leading ``@`` has just been consumed."""
+        entry_type = self._next_token().lower()
+        if entry_type == 'string':
+            self._parse_string()
+        elif entry_type not in ['comment', 'preamble']:
+            self._parse_record(entry_type)
+
+    def _parse_string(self):
+        """Parse a ``@string`` entry and store its definition."""
+        if self._next_token() in ['{', '(']:
+            field = self._parse_field()
+            if field:
+                self.definitions[field[0]] = field[1]
+
+    def _parse_record(self, record_type):
+        """Parse a record of the given type and add it to ``records``.
+
+        Field names are normalized through ``keynorms``, page ranges are compacted,
+        author/editor values are split into lists of names, and every other value
+        is converted from LaTeX to unicode.
+        """
+        if self._next_token() not in ['{', '(']:
+            return
+        key = self._next_token()
+        self.records[key] = {
+            u'id': key,
+            u'type': record_type.lower()
+        }
+        if self._next_token() != ',':
+            return
+        while True:
+            field = self._parse_field()
+            if field:
+                k, v = field
+                k = self.keynorms.get(k, k)
+                if k == 'pages':
+                    v = v.replace(' ', '').replace('--', '-')
+                if k == 'author' or k == 'editor':
+                    v = self.parse_names(v)
+                else:
+                    # Titles are deliberately not recapitalized: doing so generally
+                    # causes more problems than it solves.
+                    v = latex_to_unicode(v)
+                self.records[key][k] = v
+            # A comma means another field may follow; anything else ends the record.
+            if self._token != ',':
+                break
+
+    def _parse_field(self):
+        """Parse a ``name = value`` field.
+
+        :returns: A ``(name, value)`` tuple, or None if no field is present.
+        """
+        name = self._next_token()
+        # A closing delimiter here means the previous field ended with a trailing
+        # comma. Stop without reading further, otherwise the token that follows
+        # (typically the '@' of the next entry) would be swallowed and that entry lost.
+        if name in ('}', ')'):
+            return None
+        if self._next_token() == '=':
+            value = self._parse_value()
+            return name, value
+        return None
+
+    def _parse_value(self):
+        """Parse a value. Digits, definitions, and the contents of double quotes or curly brackets.
+
+        Parts joined with ``#`` are concatenated. Parsing stops at the first token that
+        cannot be part of a value, which is left in ``_token`` for the caller to inspect.
+
+        :returns: The value with runs of whitespace collapsed to single spaces.
+        """
+        val = []
+        while True:
+            t = self._next_token()
+            if t == '"':
+                # Quoted string: a double quote nested inside braces does not end it.
+                brac_counter = 0
+                while True:
+                    t = self._next_token(skipws=False)
+                    if t == '{':
+                        brac_counter += 1
+                    elif t == '}':
+                        brac_counter -= 1
+                    if t == '"' and brac_counter <= 0:
+                        break
+                    val.append(t)
+            elif t == '{':
+                # Braced string: ends at the brace matching the one just consumed.
+                brac_counter = 0
+                while True:
+                    t = self._next_token(skipws=False)
+                    if t == '{':
+                        brac_counter += 1
+                    elif t == '}':
+                        brac_counter -= 1
+                    if brac_counter < 0:
+                        break
+                    val.append(t)
+            elif re.match(r'\w', t):
+                # Bare word or number: substitute a @string definition if one exists.
+                val.extend([self.definitions.get(t, t), ' '])
+            elif t == '#':
+                # Concatenation operator: keep collecting parts.
+                pass
+            else:
+                break
+
+        value = ' '.join(''.join(val).split())
+        return value
+
+    @classmethod
+    def parse_names(cls, names):
+        """Parse a string of names separated by "and" like in a BibTeX authors field.
+
+        The lookahead only accepts an ``and`` whose following text reaches an opening
+        brace or the end of the string before any closing brace, so an ``and`` inside a
+        ``{...}`` group does not split the name.
+
+        :returns: A list of names, each converted from LaTeX to unicode.
+        """
+        names = [latex_to_unicode(n) for n in re.split(r'\sand\s(?=[^{}]*(?:\{|$))', names) if n]
+        return names
+
+    @property
+    def size(self):
+        """Return the number of records parsed."""
+        return len(self.records)
+
+    @property
+    def records_list(self):
+        """Return the records as a list of dictionaries."""
+        # Materialize the view so that a real list is returned on Python 3 as well.
+        return list(self.records.values())
+
+    @property
+    def metadata(self):
+        """Return metadata for the parsed collection of records.
+
+        The record count is added automatically under ``records``; metadata given to
+        the constructor takes precedence if it uses the same key.
+        """
+        meta = {u'records': self.size}
+        meta.update(self.meta)
+        return meta
+
+    @property
+    def json(self):
+        """Return a list of records as a JSON string. Follows the BibJSON convention."""
+        return json.dumps(OrderedDict([('metadata', self.metadata), ('records', self.records_list)]))
+
+
+def parse_bibtex(data):
+    """Parse a BibTeX string in one call and return its records.
+
+    :param data: The BibTeX source text.
+    :returns: The ``records_list`` of a ``BibtexParser`` that has parsed ``data``.
+    """
+    parser = BibtexParser(data)
+    parser.parse()
+    records = parser.records_list
+    return records
+
+
+# TODO: Improvements to BibtexParser
+# - Initialize with options, then pass text to .parse method to reuse an instance?
+# - Initialize with a single entry, and have attributes that correspond to the bibtex fields?
+# - Have a classmethod that takes text containing multiple entries, then returns a list of instances
+# - Have a list wrapper class that allows serialization of all at once?
+
+# TODO: BibtexWriter - write python dict or BibJSON to BibTeX