
Commit 8c91643

Merge pull request CenterForOpenScience#369 from CenterForOpenScience/feature/doctests

Add support for doctests

erinspace committed Sep 17, 2015
2 parents: ad3ee08 + a962a13
Showing 18 changed files with 151 additions and 133 deletions.
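Note: the doctests added below can be exercised with the standard library runner. A minimal sketch, assuming it is run from the repository root (this invocation is illustrative and not part of the diff; the project's own test-runner wiring is in files not shown here):

    import doctest

    from scrapi.base import helpers

    # Collects every '>>>' example from the module's docstrings, runs each one,
    # and compares the actual output against the expected output shown.
    doctest.testmod(helpers, verbose=True)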
4 changes: 2 additions & 2 deletions scrapi/base/__init__.py
@@ -21,7 +21,7 @@
     build_properties,
     oai_get_records_and_token,
     compose,
-    date_formatter,
+    datetime_formatter,
     null_on_error,
     coerce_to_list
 )
@@ -152,7 +152,7 @@ def formatted_properties(self):
 
     def format_property(self, property):
         if property == 'date':
-            fn = compose(lambda x: map(null_on_error(date_formatter), x), coerce_to_list, self.resolve_property)
+            fn = compose(lambda x: map(null_on_error(datetime_formatter), x), coerce_to_list, self.resolve_property)
         else:
             fn = self.resolve_property
         return (property, (
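Note: null_on_error is imported and used above but defined outside the changed hunks. A plausible sketch of the decorator it names, assuming it simply converts exceptions to None (the real body in scrapi/base/helpers.py may differ):

    def null_on_error(func):
        # Wrap func so a failing call returns None instead of raising, which
        # lets map(null_on_error(datetime_formatter), x) skip unparseable dates.
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception:
                return None
        return wrapper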
117 changes: 93 additions & 24 deletions scrapi/base/helpers.py
@@ -18,9 +18,19 @@
 URL_REGEX = re.compile(r'(https?:\/\/\S*\.[^\s\[\]\<\>\}\{\^]*)')
 DOI_REGEX = re.compile(r'(doi:10\.\S*)')
 
-''' Takes a value, returns a function that always returns that value
-Useful inside schemas for defining constants '''
-CONSTANT = lambda x: lambda *_, **__: x
+
+def CONSTANT(x):
+    ''' Takes a value, returns a function that always returns that value
+    Useful inside schemas for defining constants
+
+    >>> CONSTANT(7)('my', 'name', verb='is')
+    7
+    >>> CONSTANT([123, 456])()
+    [123, 456]
+    '''
+    def inner(*y, **z):
+        return x
+    return inner
 
 
 def build_properties(*args):
@@ -48,15 +58,37 @@ def build_property(name, expr, description=None, uri=None):
 
 
 def single_result(l, default=''):
-    return l[0] if l else default
+    ''' A function that will return the first element of a list if it exists
+
+    >>> print(single_result(['hello', None]))
+    hello
+    >>> print(single_result([], default='hello'))
+    hello
+    >>> print(single_result([]))
+    <BLANKLINE>
+    '''
+    return l[0] if l else default
 
 
 def compose(*functions):
-    '''
-    evaluates functions from right to left.
-    ex. compose(f, g)(*x, **y) = f(g(*x, **y))
-
-    credit to sloria
-    '''
+    ''' evaluates functions from right to left.
+
+    >>> add = lambda x, y: x + y
+    >>> add3 = lambda x: x + 3
+    >>> divide2 = lambda x: x/2
+    >>> subtract4 = lambda x: x - 4
+    >>> subtract1 = compose(add3, subtract4)
+    >>> subtract1(1)
+    0
+    >>> compose(subtract1, add3)(4)
+    6
+    >>> compose(int, add3, add3, divide2)(4)
+    8
+    >>> compose(int, divide2, add3, add3)(4)
+    5
+    >>> compose(int, divide2, compose(add3, add3), add)(7, 3)
+    8
+    '''
    def inner(func1, func2):
        return lambda *x, **y: func1(func2(*x, **y))
@@ -65,8 +97,18 @@ def inner(func1, func2):
 
 
 def updated_schema(old, new):
     ''' Creates a dictionary resulting from adding all keys/values of the second to the first
-    The second dictionary will overwrite the first.'''
+    The second dictionary will overwrite the first.
+
+    >>> old, new = {'name': 'ric', 'job': None}, {'name': 'Rick'}
+    >>> updated = updated_schema(old, new)
+    >>> len(updated.keys())
+    2
+    >>> print(updated['name'])
+    Rick
+    >>> updated['job'] is None
+    True
+    '''
     d = deepcopy(old)
     for key, value in new.items():
         if isinstance(value, dict) and old.get(key) and isinstance(old[key], dict):
@@ -77,18 +119,28 @@ def default_name_parser(names):
 
 
 def default_name_parser(names):
-    contributor_list = []
-    for person in names:
-        name = HumanName(person)
-        contributor = {
-            'name': person,
-            'givenName': name.first,
-            'additionalName': name.middle,
-            'familyName': name.last,
-        }
-        contributor_list.append(contributor)
-
-    return contributor_list
+    ''' Takes a list of names, and attempts to parse them
+    '''
+    return list(map(maybe_parse_name, names))
+
+
+def maybe_parse_name(name):
+    ''' Tries to parse a name. If the parsing fails, returns a dictionary
+    with just the unparsed name (as per the SHARE schema)
+    '''
+    return null_on_error(parse_name)(name) or {'name': name}
+
+
+def parse_name(name):
+    ''' Takes a human name, parses it into given/middle/last names
+    '''
+    person = HumanName(name)
+    return {
+        'name': name,
+        'givenName': person.first,
+        'additionalName': person.middle,
+        'familyName': person.last
+    }
 
 
 def format_tags(all_tags, sep=','):
@@ -278,17 +330,34 @@ def inner(*args, **kwargs):
 
 
 def coerce_to_list(thing):
-    ''' If a value is not already a list or tuple, puts that value in a length 1 list'''
+    ''' If a value is not already a list or tuple, puts that value in a length 1 list
+
+    >>> niceties = coerce_to_list('hello')
+    >>> len(niceties)
+    1
+    >>> print(niceties[0])
+    hello
+    >>> niceties2 = coerce_to_list(['hello'])
+    >>> niceties2 == niceties
+    True
+    >>> niceties3 = (coerce_to_list(('hello', 'goodbye')))
+    >>> len(niceties3)
+    2
+    >>> print(niceties3[0])
+    hello
+    >>> print(niceties3[1])
+    goodbye
+    '''
     if not (isinstance(thing, list) or isinstance(thing, tuple)):
         return [thing]
-    return thing
+    return list(thing)
 
 
-def date_formatter(date_string):
+def datetime_formatter(datetime_string):
     '''Takes an arbitrary date/time string and parses it, adds time
     zone information and returns a valid ISO-8601 datetime string
     '''
-    date_time = parser.parse(date_string)
+    date_time = parser.parse(datetime_string)
     if not date_time.tzinfo:
         date_time = date_time.replace(tzinfo=pytz.UTC)
     return date_time.isoformat()
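Note: per the code above, datetime_formatter normalizes naive timestamps to UTC and preserves an explicit offset; the values below are illustrative, not part of the diff:

    >>> datetime_formatter('September 17, 2015')
    '2015-09-17T00:00:00+00:00'
    >>> datetime_formatter('2015-09-17T10:30:00-04:00')
    '2015-09-17T10:30:00-04:00'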
6 changes: 3 additions & 3 deletions scrapi/base/schemas.py
@@ -5,7 +5,7 @@
     format_tags,
     single_result,
     language_codes,
-    date_formatter,
+    datetime_formatter,
     oai_process_uris,
     build_properties,
     default_name_parser,
@@ -17,7 +17,7 @@
     "description": ('//dc:description/node()', compose(lambda x: x.strip(), single_result)),
     "contributors": ('//dc:creator/node()', compose(default_name_parser, lambda x: x.split(';'), single_result)),
     "title": ('//dc:title/node()', compose(lambda x: x.strip(), single_result)),
-    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(date_formatter, single_result)),
+    "providerUpdatedDateTime": ('//dc:dateEntry/node()', compose(datetime_formatter, single_result)),
     "uris": {
         "canonicalUri": ('//dcq:identifier-citation/node()', compose(lambda x: x.strip(), single_result)),
         "objectUris": [('//dc:doi/node()', compose(lambda x: 'http://dx.doi.org/' + x, single_result))]
@@ -53,7 +53,7 @@
 OAISCHEMA = {
     "contributors": ('//dc:creator/node()', '//dc:contributor/node()', oai_process_contributors),
     "uris": ('//dc:doi/node()', '//dc:identifier/node()', oai_process_uris),
-    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(date_formatter, single_result)),
+    'providerUpdatedDateTime': ('//ns0:header/ns0:datestamp/node()', compose(datetime_formatter, single_result)),
     'title': ('//dc:title/node()', single_result),
     'description': ('//dc:description/node()', single_result),
     'subjects': ('//dc:subject/node()', format_tags),
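Note: each schema entry pairs an XPath expression with a transform, so compose(datetime_formatter, single_result) first collapses the XPath node list to a single string, then normalizes it. A worked example under the helpers above (values illustrative):

    >>> compose(datetime_formatter, single_result)(['2015-09-17'])
    '2015-09-17T00:00:00+00:00'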
4 changes: 2 additions & 2 deletions scrapi/harvesters/biomedcentral.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import build_properties, date_formatter
+from scrapi.base.helpers import build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -65,7 +65,7 @@ def schema(self):
                 'providerUris': ('/articleFullUrl', '/abstractPath', lambda x, y: [x, y])
             },
             'title': ('/bibliographyTitle', '/blurbTitle', lambda x, y: x or y),
-            'providerUpdatedDateTime': ('/published Date', date_formatter),
+            'providerUpdatedDateTime': ('/published Date', datetime_formatter),
             'description': '/blurbText',
             'freeToRead': {
                 'startDate': ('/is_free', '/published Date', lambda x, y: y if x else None)
4 changes: 2 additions & 2 deletions scrapi/harvesters/clinicaltrials.py
@@ -21,7 +21,7 @@
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
 from scrapi.base.schemas import default_name_parser
-from scrapi.base.helpers import compose, single_result, build_properties, date_formatter
+from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -49,7 +49,7 @@ class ClinicalTrialsHarvester(XMLHarvester):
         "uris": {
             "canonicalUri": ("//required_header/url/node()", single_result)
         },
-        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(date_formatter, single_result)),
+        "providerUpdatedDateTime": ("lastchanged_date/node()", compose(datetime_formatter, single_result)),
         "title": ('//official_title/node()', '//brief_title/node()', lambda x, y: single_result(x) or single_result(y)),
         "description": ('//brief_summary/textblock/node()', '//brief_summary/textblock/node()', lambda x, y: single_result(x) or single_result(y)),
         "tags": ("//keyword/node()", lambda tags: [tag.lower() for tag in tags]),
4 changes: 2 additions & 2 deletions scrapi/harvesters/crossref.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import build_properties, compose, date_formatter
+from scrapi.base.helpers import build_properties, compose, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -72,7 +72,7 @@ def schema(self):
         return {
             'title': ('/title', lambda x: x[0] if x else ''),
             'description': ('/subtitle', lambda x: x[0] if (isinstance(x, list) and x) else x or ''),
-            'providerUpdatedDateTime': ('/issued/date-parts', compose(date_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
+            'providerUpdatedDateTime': ('/issued/date-parts', compose(datetime_formatter, lambda x: ' '.join([str(part) for part in x[0]]))),
             'uris': {
                 'canonicalUri': '/URL'
             },
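Note: Crossref serves issued dates as nested date-parts (e.g. [[2015, 9, 17]]), so the lambda above flattens the first entry into a space-separated string before datetime_formatter parses it. A worked example (values illustrative):

    >>> ' '.join([str(part) for part in [[2015, 9, 17]][0]])
    '2015 9 17'
    >>> datetime_formatter('2015 9 17')
    '2015-09-17T00:00:00+00:00'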
19 changes: 2 additions & 17 deletions scrapi/harvesters/dataone.py
@@ -5,8 +5,6 @@
 
 from __future__ import unicode_literals
 
-import re
-
 import logging
 from datetime import timedelta, date
 
@@ -23,27 +21,14 @@
 from scrapi.base import XMLHarvester
 from scrapi.util import copy_to_unicode
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import compose, single_result, build_properties, date_formatter
+from scrapi.base.helpers import compose, single_result, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
 DEFAULT_ENCODING = 'UTF-8'
 DATAONE_SOLR_ENDPOINT = 'https://cn.dataone.org/cn/v1/query/solr/'
 
 
-def process_doi(service_id, doc_doi):
-    doi_re = '10\\.\\d{4}/\\w*\\.\\w*(/\\w*)?'
-
-    doi_list = map(lambda x: x.replace('doi', ''), doc_doi) if isinstance(doc_doi, list) else [doc_doi.replace('doi', '')]
-
-    for item in [service_id] + doi_list:
-        try:
-            return re.search(doi_re, item).group(0)
-        except AttributeError:
-            continue
-    return ''
-
-
 def process_contributors(author, submitters, contributors,
                          investigators):
     if not author:
@@ -149,7 +134,7 @@ class DataOneHarvester(XMLHarvester):
         'contributors': ("str[@name='author']/node()", "str[@name='submitter']/node()", "arr[@name='origin']/str/node()", "arr[@name='investigator']/str/node()", process_contributors),
         'uris': ("str[@name='id']/node()", "//str[@name='dataUrl']/node()", "arr[@name='resourceMap']/str/node()", partial(helpers.oai_process_uris, use_doi=True)),
        'tags': ("//arr[@name='keywords']/str/node()", lambda x: x if isinstance(x, list) else [x]),
-        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(date_formatter, single_result)),
+        'providerUpdatedDateTime': ("str[@name='dateModified']/node()", compose(datetime_formatter, single_result)),
        'title': ("str[@name='title']/node()", single_result),
        'description': ("str[@name='abstract']/node()", single_result)
     }
14 changes: 0 additions & 14 deletions scrapi/harvesters/dryad.py
@@ -14,20 +14,6 @@
 logger = logging.getLogger(__name__)
 
 
-def format_dois_dryad(*args):
-    prefix = 'http://dx.doi.org/{}'
-    urls = []
-    for arg in args:
-        if isinstance(arg, list):
-            for url in arg:
-                if 'doi:' in url:
-                    urls.append(prefix.format(url.replace('doi:', '')))
-        elif arg:
-            if 'doi:' in arg:
-                urls.append(prefix.format(arg.replace('doi:', '')))
-    return urls
-
-
 class DryadHarvester(OAIHarvester):
     short_name = 'dryad'
     long_name = 'Dryad Data Repository'
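Note: for reference, the deleted helper rewrote doi: identifiers into resolvable URLs, behavior presumably now covered by the shared URI helpers used elsewhere in this diff. Reconstructed from the removed code (values illustrative):

    >>> format_dois_dryad('doi:10.5061/dryad.123', ['doi:10.5061/dryad.456'])
    ['http://dx.doi.org/10.5061/dryad.123', 'http://dx.doi.org/10.5061/dryad.456']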
4 changes: 2 additions & 2 deletions scrapi/harvesters/figshare.py
@@ -18,7 +18,7 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import default_name_parser, build_properties, date_formatter
+from scrapi.base.helpers import default_name_parser, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
@@ -34,7 +34,7 @@ class FigshareHarvester(JSONHarvester):
         'title': '/title',
         'description': '/description',
         'contributors': ('/authors', lambda x: default_name_parser([person['author_name'] for person in x])),
-        'providerUpdatedDateTime': ('/modified_date', date_formatter),
+        'providerUpdatedDateTime': ('/modified_date', datetime_formatter),
         'uris': {
             'canonicalUri': ('/DOI', lambda x: x[0] if isinstance(x, list) else x),
             'providerUris': [
6 changes: 3 additions & 3 deletions scrapi/harvesters/harvarddataverse.py
@@ -17,14 +17,14 @@
 from scrapi import settings
 from scrapi.base import JSONHarvester
 from scrapi.linter.document import RawDocument
-from scrapi.base.helpers import default_name_parser, build_properties, date_formatter
+from scrapi.base.helpers import default_name_parser, build_properties, datetime_formatter
 
 logger = logging.getLogger(__name__)
 
 
 try:
     from scrapi.settings import HARVARD_DATAVERSE_API_KEY
-except ImportError:
+except ImportError:  # pragma: no cover
     HARVARD_DATAVERSE_API_KEY = None
     logger.error('No HARVARD_DATAVERSE_API_KEY found, Harvard Dataverse will always return []')
 
@@ -44,7 +44,7 @@ class HarvardDataverseHarvester(JSONHarvester):
         'title': '/name',
         'description': '/description',
         'contributors': ('/authors', default_name_parser),
-        'providerUpdatedDateTime': ('/published_at', date_formatter),
+        'providerUpdatedDateTime': ('/published_at', datetime_formatter),
         'uris': {
             'canonicalUri': '/url',
             'objectUris': [
[Diff truncated by the page viewer: 8 of the 18 changed files are not shown above.]
